1 Introduction

2 Data preparation

2.1 Read in MUTANT TRAP SCORES

  • Generate ‘FullDataset’ i.e ‘Peptide, Dataset, HLA, PredictionScore’. These are the score generated for mutants, which are single amino acid substitutions away from our Wuhan CoV2 targets ofinterest
# Read in TRAP predictoins
FullDataset=fread("TRAPP/DATA_V3_SARS2_ONLY/pred_scores/prott5_xl_bfd_peptides_sars2_mut_test_prediction_withlabel_2.txt")%>% distinct()
# Read in information supplied to TRAP, to link pMHC with score. This includes all pMHC generated  via in silico mutagenesis
INPUT_DATA_TRAPP=fread("MUTAGENESIS_SUBSET_OMI_WUHAN_HLA_RUN_CHL_DNN_V4_INC_SUBVAR_WTNB_MTBINDER_ALLCOV2_MUTAGENESIS.txt")%>%
         distinct()%>% mutate(BA_Rank = as.numeric(BA_Rank))%>%
        mutate(nlog2Rank = - log2(BA_Rank))

INPUT_DATA_TRAPP=INPUT_DATA_TRAPP %>% mutate(nlog2Rank = round(nlog2Rank, digits=4))

FullDataset=FullDataset%>% mutate(nlog2Rank = round(nlog2Rank, digits=4)) # Use nlog2Rank to link pMHC with TRAP output score
FullDataset=FullDataset %>% inner_join(INPUT_DATA_TRAPP)
## Joining, by = c("Peptide", "nlog2Rank", "Dataset", "Predicted_Binding")
# Cleaning
FullDataset %>% select(Dataset)%>% table
## .
##    Mutagenesis    OmicronTest SUBVAR_MT_TEST SUBVAR_WT_TEST      WuhanTest 
##          86984            431            324            324            477
FullDataset=FullDataset%>%dplyr::rename(Prediction=prediction)
FullDataset=FullDataset %>% select(Peptide, Dataset, Predicted_Binding, Prediction)
FullDataset=FullDataset %>% filter(Dataset == "Mutagenesis")

#FullDataset %>% filter(Peptide %in% MUTATIONS$VariantAlignment)

#FullDataset %>% filter(Peptide %in% "SHRRARSVA")
FullDataset %>% select(Predicted_Binding) %>% table()
## .
## HLA-A0101 HLA-A0201 HLA-A0202 HLA-A0206 HLA-A0211 HLA-A0301 HLA-A0302 HLA-A1101 
##      2258      1355      1339      1610      1467      1373      1374      1497 
## HLA-A2301 HLA-A2402 HLA-A2501 HLA-A2601 HLA-A2902 HLA-A3001 HLA-A3002 HLA-A3101 
##      1724      1794      1371      1533      2355       935      2161      1091 
## HLA-A3201 HLA-A3303 HLA-A6801 HLA-A6802 HLA-A7401 HLA-B0702 HLA-B0801 HLA-B1402 
##      1411      1021      1420       750      1335       935       862      1109 
## HLA-B1501 HLA-B1503 HLA-B1801 HLA-B2705 HLA-B3501 HLA-B3503 HLA-B3801 HLA-B4001 
##      1680      1718       718       281      1819      1002       764       639 
## HLA-B4002 HLA-B4006 HLA-B4201 HLA-B4402 HLA-B4403 HLA-B4501 HLA-B5001 HLA-B5101 
##       407       270       882       887       870       248       387       754 
## HLA-B5201 HLA-B5301 HLA-B5701 HLA-B5801 HLA-B5802 HLA-C0102 HLA-C0202 HLA-C0210 
##      1072       765       743       790      1039      2081      2185      2185 
## HLA-C0302 HLA-C0303 HLA-C0304 HLA-C0401 HLA-C0501 HLA-C0602 HLA-C0701 HLA-C0702 
##      1990      1593      1593      2022      1184      1445      1892      2488 
## HLA-C0801 HLA-C0802 HLA-C1202 HLA-C1203 HLA-C1402 HLA-C1502 HLA-C1601 HLA-C1701 
##      1812      1395      2143      1948      2552      1203      1763      1695

2.2 Get full dataset, all WT and MT.

  • We need to find all pMHC rank scores, to know what pMHC to include, exclude and impute.
  • This is because e.g., I am not including sampels where both WT-MT dont bind MHC.
# Class I OMICRON MUTANTS
TEST_DATA_LOCATION = "NETMHCPAN_CLASSI_MUTAGENESIS/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
 Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
 Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
 Netmhcpanres=Netmhcpanres %>% select(! c(file,Pos,ID,core,icore)) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=HLA_Allele,pattern = "\\:",replacement = ""))

 NETMHC_CLASSI=Netmhcpanres
NETMHC_CLASSI=NETMHC_CLASSI %>% mutate(MT_nM_41 = round(50000^(1-`BA-score`) ,digits=5))
NETMHC_CLASSI=NETMHC_CLASSI %>% mutate(Binder = ifelse(BA_Rank>=2, "NONBINDER","BINDER"))
#NETMHC_CLASSI = NETMHC_CLASSI %>% filter(Binder == "BINDER")


NETMHC_CLASSI %>% head
## # A tibble: 6 × 10
##   Peptide    `EL-score` EL_Rank `BA-score` BA_Rank    Ave    NB HLA_Allele Binder
##   <chr>           <dbl>   <dbl>      <dbl>   <dbl>  <dbl> <int> <chr>      <chr> 
## 1 AASHNIALIW     0.005     7.24     0.0896    8.15 0.005      0 HLA-A0101  NONBI…
## 2 ACSHNIALIW     0.0032    9.36     0.0788   10.4  0.0032     0 HLA-A0101  NONBI…
## 3 ADSHNIALIW     0.0052    7.08     0.0843    9.16 0.0052     0 HLA-A0101  NONBI…
## 4 AESHNIALIW     0.0046    7.61     0.0792   10.4  0.0046     0 HLA-A0101  NONBI…
## 5 AFSHNIALIW     0.003     9.70     0.0857    8.87 0.003      0 HLA-A0101  NONBI…
## 6 AGSHNIALIW     0.003     9.68     0.0742   11.7  0.003      0 HLA-A0101  NONBI…
## # … with 1 more variable: MT_nM_41 <dbl>
NETMHC_CLASSI %>% select(Peptide) %>% distinct()%>% nrow
## [1] 12362
MUTAGENESIS_BINDERS=NETMHC_CLASSI %>% select(Peptide, HLA_Allele, BA_Rank,Binder)%>% mutate(Dataset="Mutagenesis")%>% distinct()
MUTAGENESIS_BINDERS=MUTAGENESIS_BINDERS%>% dplyr::rename(Predicted_Binding=HLA_Allele, AASeq2=Peptide, MT_Binder=Binder, MT_BA_Rank = BA_Rank)
NETMHC_CLASSI %>% filter(Peptide %in% "EELKKLLEQW")
## # A tibble: 64 × 10
##    Peptide  `EL-score` EL_Rank `BA-score` BA_Rank    Ave    NB HLA_Allele Binder
##    <chr>         <dbl>   <dbl>      <dbl>   <dbl>  <dbl> <int> <chr>      <chr> 
##  1 EELKKLL…     0.0021   12.0      0.0297    51.9 0.0021     0 HLA-A0101  NONBI…
##  2 EELKKLL…     0.0002   34.8      0.0175    77.8 0.0002     0 HLA-A0201  NONBI…
##  3 EELKKLL…     0.0005   33.5      0.0353    67.5 0.0005     0 HLA-A0202  NONBI…
##  4 EELKKLL…     0.0005   33.9      0.0269    76.9 0.0005     0 HLA-A0206  NONBI…
##  5 EELKKLL…     0.0002   47.4      0.0206    78.6 0.0002     0 HLA-A0211  NONBI…
##  6 EELKKLL…     0.0001   31.7      0.0197    76.6 0.0001     0 HLA-A0301  NONBI…
##  7 EELKKLL…     0.0002   32        0.0202    72.4 0.0002     0 HLA-A0302  NONBI…
##  8 EELKKLL…     0.0001   30.2      0.0206    69.7 0.0001     0 HLA-A1101  NONBI…
##  9 EELKKLL…     0.0037    6.52     0.0709    18.1 0.0037     0 HLA-A2301  NONBI…
## 10 EELKKLL…     0.0022    7.97     0.0566    18.1 0.0022     0 HLA-A2402  NONBI…
## # … with 54 more rows, and 1 more variable: MT_nM_41 <dbl>
  • Read in information regarding mutations for Wuhan eps with mutation in BA1,
  • Read in Wuhan set pMHC scores
MUTATIONS = readRDS("OMICRON_EPITOPE_MUTATIONS.rds")
WUHAN_PEPTIDES_BINDING_SCORES=readRDS("ORIGINAL_PEPTIDES_BINDING_SCORES_V2.rds")
WUHAN_PEPTIDES_BINDING_SCORES=WUHAN_PEPTIDES_BINDING_SCORES %>% filter(Peptide %in% MUTATIONS$Peptide)

WUHAN_PEPTIDES_BINDING_SCORES %>% head
## # A tibble: 6 × 15
##   Peptide    Predicted_Binding `Thalf(h)`  `EL-score` EL_Rank `BA-score` BA_Rank
##   <chr>      <chr>             <chr>       <chr>      <chr>   <chr>      <chr>  
## 1 AEHVNNSY   HLA-A0101,HLA-A0… 0.2205,0.1… 0.0529,0,… 1.914,… 0.0989,0.… 6.7589…
## 2 AKSHNIALIW HLA-A0101,HLA-A0… 0.2148,0.1… 0.0028,1e… 10.094… 0.0679,0.… 13.890…
## 3 APGQTGKIA  HLA-A0101,HLA-A0… 0.0831,0.0… 2e-04,0,1… 40.307… 0.0146,0.… 86.212…
## 4 APRITFGGP  HLA-A0101,HLA-A0… 0.0697,0.0… 1e-04,0,0… 68.333… 0.0161,0.… 82.743…
## 5 CNDPFLGVY  HLA-A0101,HLA-A0… 0.3908,0.1… 0.3985,1e… 0.3936… 0.4964,0.… 0.1649…
## 6 CNDPFLGVYY HLA-A0101,HLA-A0… 0.5458,0.1… 0.4072,0,… 0.3849… 0.509,0.0… 0.1504…
## # … with 8 more variables: Ave <chr>, Binder <chr>, nM_41 <chr>, nM <chr>,
## #   Aff_Binder <chr>, Length <int>, HydrophobicCount <int>, HydroFraction <dbl>
# cleaning
WUHAN_PEPTIDES_BINDING_SCORES=WUHAN_PEPTIDES_BINDING_SCORES %>% select(Peptide, Predicted_Binding, BA_Rank, Binder)%>%
        separate_rows_(cols = c("Predicted_Binding","BA_Rank","Binder"),sep=",")%>% mutate(Dataset = "WuhanTest")

WUHAN_PEPTIDES_BINDING_SCORES=WUHAN_PEPTIDES_BINDING_SCORES %>% dplyr::rename(AASeq1=Peptide)%>% select(!Dataset)
  • Read in Omicron mutated pMHC scores
OMI_PEPTIDES_BINDING_SCORES=readRDS("OMICRON_PEPTIDES_BINDING_SCORES_V2.rds")
OMI_PEPTIDES_BINDING_SCORES %>% head
## # A tibble: 6 × 13
##   VariantAlignment MT_Predicted_Bind… `MT_Thalf(h)`   `MT_EL-score`  MT_EL_Rank 
##   <chr>            <chr>              <chr>           <chr>          <chr>      
## 1 AEYVNNSY         HLA-A0101,HLA-A02… 0.2295,0.1438,… 0.0412,0,1e-0… 2.2565,55.…
## 2 AKSHNITLIW       HLA-A0101,HLA-A02… 0.2395,0.1924,… 0.0048,1e-04,… 7.3782,47.…
## 3 ALRITFGGP        HLA-A0101,HLA-A02… 0.0843,0.1071,… 0,9e-04,0.004… 73.4615,18…
## 4 APGQTGNIA        HLA-A0101,HLA-A02… 0.0906,0.1022,… 5e-04,0,1e-04… 29.3103,52…
## 5 CNDPFLDHK        HLA-A0101,HLA-A02… 0.1508,0.093,0… 0.0105,0,1e-0… 4.7481,63.…
## 6 CNDPFLDHKN       HLA-A0101,HLA-A02… 0.0855,0.0757,… 2e-04,0,0,0,0… 46,90,95,9…
## # … with 8 more variables: MT_BA-score <chr>, MT_BA_Rank <chr>, MT_Ave <chr>,
## #   MT_Binder <chr>, MT_nM_41 <chr>, Predicted_Binding <chr>, MT_nM <chr>,
## #   MT_Aff_Binder <chr>
OMI_PEPTIDES_BINDING_SCORES=OMI_PEPTIDES_BINDING_SCORES %>% select(VariantAlignment, Predicted_Binding, MT_BA_Rank, MT_Binder)%>%
        separate_rows_(cols = c("Predicted_Binding","MT_BA_Rank","MT_Binder"),sep=",")%>%
        dplyr::rename(AASeq2=VariantAlignment)%>%
        mutate(Dataset = "OmicronTest")

OMI_PEPTIDES_BINDING_SCORES %>% filter(AASeq2 == "SHRRARSVA")
## # A tibble: 64 × 5
##    AASeq2    Predicted_Binding MT_BA_Rank MT_Binder Dataset    
##    <chr>     <chr>             <chr>      <chr>     <chr>      
##  1 SHRRARSVA HLA-A0101         63.4561    NONBINDER OmicronTest
##  2 SHRRARSVA HLA-A0201         81.6515    NONBINDER OmicronTest
##  3 SHRRARSVA HLA-A0202         77.2365    NONBINDER OmicronTest
##  4 SHRRARSVA HLA-A0206         83.1944    NONBINDER OmicronTest
##  5 SHRRARSVA HLA-A0211         78.4834    NONBINDER OmicronTest
##  6 SHRRARSVA HLA-A0301         34.8778    NONBINDER OmicronTest
##  7 SHRRARSVA HLA-A0302         53.217     NONBINDER OmicronTest
##  8 SHRRARSVA HLA-A1101         52.194     NONBINDER OmicronTest
##  9 SHRRARSVA HLA-A2301         42.1591    NONBINDER OmicronTest
## 10 SHRRARSVA HLA-A2402         37.9478    NONBINDER OmicronTest
## # … with 54 more rows
# Mutant data consists of the Mutagenesis set + the Omicron epitope.
MT_DATA=MUTAGENESIS_BINDERS %>% rbind(OMI_PEPTIDES_BINDING_SCORES) %>% select(!Dataset)%>% distinct()
# WT data consists of the Wuhan set
WT_DATA = WUHAN_PEPTIDES_BINDING_SCORES
#FullDataset %>% mutate(Type = ifelse)
# Read in 'INSILICO_MUTAGENESIS_WUHANOMICRON' - this links WT to each respective mutant.
# Filter for substitutions and 9/10mers
INSILICO_MUTAGENESIS_WUHANOMICRON=readRDS("INSILICO_MUTAGENESIS_WUHANOMICRON.rds") %>% filter(Mut_type == "Substitution")
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% mutate(Length = nchar(AASeq1))%>% filter(Length %in% c(9,10))
# Inner join all the data. So for each row we have WTPeptide, MTPeptide, HLA, Rank scores for WT and MT.
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% left_join(MT_DATA)%>% drop_na()
## Joining, by = "AASeq2"
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% left_join(WUHAN_PEPTIDES_BINDING_SCORES)%>% drop_na()
## Joining, by = c("AASeq1", "Predicted_Binding")

3 Exclude WT AND MT NON BINDERS

INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON%>% filter(! (Binder == "NONBINDER" & MT_Binder == "NONBINDER"))
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% select(order(colnames(INSILICO_MUTAGENESIS_WUHANOMICRON)))

4 Join everything

# AASeq2 is the mutant peptide.
FullDataset=FullDataset %>% dplyr::rename(AASeq2=Peptide)

ANTIJOIN_SET=INSILICO_MUTAGENESIS_WUHANOMICRON %>% anti_join(FullDataset)
## Joining, by = c("AASeq2", "Predicted_Binding")
# Join by column AASeq2. Prediction here is the TRAP score for mutant peptides.
INSILICO_MUTAGENESIS_WUHANOMICRON = INSILICO_MUTAGENESIS_WUHANOMICRON%>% inner_join(FullDataset)
## Joining, by = c("AASeq2", "Predicted_Binding")
ANTIJOIN_SET%>% mutate(GROUP = paste0(Binder, "_",MT_Binder))%>% select(GROUP) %>% table
## .
##    BINDER_BINDER BINDER_NONBINDER NONBINDER_BINDER 
##              215            28581               12
# Deal w binder binder. Somehow theyrer not in fulldataset, maybe we can't link them directly by matching nlog2Rank.
# So find the scores and create the data
BINDER_BINDER=ANTIJOIN_SET %>% mutate(GROUP = paste0(Binder, "_",MT_Binder)) %>% filter(GROUP=="BINDER_BINDER")

BINDER_BINDER=BINDER_BINDER %>%
        left_join(fread("TRAPP/DATA_V3_SARS2_ONLY/pred_scores/prott5_xl_bfd_peptides_sars2_mut_test_prediction_withlabel_2.txt")%>%
                          distinct()%>%
                          dplyr::rename(AASeq2=Peptide)) %>%
        select(AASeq1, AASeq2, BA_Rank, Binder, Length, MT_BA_Rank, MT_Binder, Mut_type, Predicted_Binding, prediction) %>%
        dplyr::rename(Prediction=prediction)%>% distinct()
## Joining, by = c("AASeq2", "Predicted_Binding")
# Deal w/ binder non-binder: impute a TRAP score of 0.01 for the MT.
BINDER_NONBINDER = ANTIJOIN_SET%>% mutate(GROUP = paste0(Binder, "_",MT_Binder)) %>% filter(GROUP=="BINDER_NONBINDER")
BINDER_NONBINDER = BINDER_NONBINDER %>% select(! GROUP) %>% mutate(Prediction=0.01)

BINDER_NONBINDER%>% select(AASeq2, MT_BA_Rank, Predicted_Binding) %>%
        dplyr::rename(Peptide=AASeq2, BA_Rank=MT_BA_Rank)%>% distinct() %>%
        readr::write_csv(file="NONBINDERS_FOR_TRAPP_MUTAGENESIS.csv")

4.1 Bind everything together.

INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% select(! Dataset) %>%
         distinct() %>% rbind(BINDER_BINDER, BINDER_NONBINDER)%>%
        mutate(AASeq2_Prediction= Prediction)

4.2 Now read in WT score. These are those generated for the wuhan pMHC

# FullDataset now contains the WT TRAP score.
FullDataset=fread("TRAPP/DATA_V3_SARS2_ONLY/pred_scores/prott5_xl_bfd_peptides_sars2_mut_test_prediction_withlabel_2.txt")%>% distinct()
# Link to the specific pMHC via nlog2Rank. This is done because TRAP output lacks the particular MHC for which the observation was generated.
INPUT_DATA_TRAPP=fread("MUTAGENESIS_SUBSET_OMI_WUHAN_HLA_RUN_CHL_DNN_V4_INC_SUBVAR_WTNB_MTBINDER_ALLCOV2_MUTAGENESIS.txt")%>%
         distinct()%>% mutate(BA_Rank = as.numeric(BA_Rank))%>%
        mutate(nlog2Rank = - log2(BA_Rank))

INPUT_DATA_TRAPP=INPUT_DATA_TRAPP %>% mutate(nlog2Rank = round(nlog2Rank, digits=4))
FullDataset=FullDataset%>% mutate(nlog2Rank = round(nlog2Rank, digits=4))
FullDataset=FullDataset %>% inner_join(INPUT_DATA_TRAPP)
## Joining, by = c("Peptide", "nlog2Rank", "Dataset", "Predicted_Binding")
# Clean. Filter for Wuhan WT peptide
FullDataset %>% select(Dataset)%>% table
## .
##    Mutagenesis    OmicronTest SUBVAR_MT_TEST SUBVAR_WT_TEST      WuhanTest 
##          86984            431            324            324            477
FullDataset=FullDataset%>%dplyr::rename(Prediction=prediction)
FullDataset=FullDataset %>% select(Peptide, Dataset, Predicted_Binding, Prediction)
FullDataset=FullDataset %>% filter(Peptide %in% MUTATIONS$Peptide)
# AASeq1 is the WT peptide.
FullDataset=FullDataset %>% select(! Dataset)%>% distinct()%>% dplyr::rename(AASeq1=Peptide, WT_Prediction=Prediction)
# Leftjoin with our mutagenesis dataset
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% left_join(FullDataset)
## Joining, by = c("AASeq1", "Predicted_Binding")
# Impute a score for those where WT was a nonbinder but MT now binds. This MT may now be immunogenic thus must be analysed.
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>%
        mutate(WT_Prediction = replace(WT_Prediction, (Binder == "NONBINDER" & MT_Binder == "BINDER"), 0.01))

# Are there any NA generated from the left join? No, this means we have a WT and MT score for each observation.
colnms = colnames(INSILICO_MUTAGENESIS_WUHANOMICRON)
INSILICO_MUTAGENESIS_WUHANOMICRON %>%
  filter_at(vars(all_of(colnms)), any_vars(is.na(.)))
## Empty data.table (0 rows and 12 cols): AASeq1,AASeq2,BA_Rank,Binder,Length,MT_BA_Rank...
# Cleaning.
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>%
        select(!Prediction)%>% dplyr::rename(Prediction = AASeq2_Prediction, WuhanScore = WT_Prediction)
FULL_PREDICTIONS_DT=INSILICO_MUTAGENESIS_WUHANOMICRON%>% dplyr::rename(Peptide=AASeq1)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% filter(! Peptide == AASeq2)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% dplyr::rename(TRAPP_MUTANT_Prediction=Prediction)%>%
        dplyr::rename(TRAPP_Prediction = WuhanScore)%>% mutate(BA_Rank = as.numeric(BA_Rank))%>%
        mutate(MT_BA_Rank = as.numeric(MT_BA_Rank))

# Now, we want to produce an overall score incorporating both (TRAP score * MHC binding score)
# MHCScore is produced by a negative logit function of the Rank score. X-2 as the threshold for 2 being the cutoff threshold for binding.
# 1.5 selected abritrarily to reduce the magnitude of the curve.. so rank scores of e.g., 2.1 aren't treated as harshly as they may be with a higher value such as 5. We are here beign a little conservative.

#negative logistic function
neg_logit_function = function(x) {
  1/(1+exp(1.5*(x-2)))
}
# Transform WT and MT MHCScore
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(MHCScore = neg_logit_function(BA_Rank))%>% mutate(MT_MHCScore = neg_logit_function(MT_BA_Rank))
#FULL_MUTATIONS_DT %>% mutate(MHCScore = min_max_norm(BA_Rank))%>% mutate(MT_MHCScore = min_max_norm(MT_BA_Rank))
# Calculate the Wuhan Immunogenicity score and the MT Immunogenicity score (Prediction)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = TRAPP_Prediction * MHCScore)%>% mutate(Prediction = TRAPP_MUTANT_Prediction* MT_MHCScore)
# To avoid huge skews at the limit approaching zero, we replace anything with <0.01 with 0.01
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = replace(WuhanScore, WuhanScore<0.01, 0.01))%>%
  mutate(Prediction = replace(Prediction, Prediction<0.01, 0.01))
#FULL_MUTATIONS_DT=FULL_MUTATIONS_DT %>% rbind(NA_SET_BINDER_NONBINDER)%>% rbind(NA_SET_NONBINDER_BINDER)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = round(WuhanScore, digits=4))%>%
        mutate(Prediction = round(Prediction, digits=4))%>% distinct()
# Anything rounded to == 0 is set to 0.01
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = replace(WuhanScore, WuhanScore==0.0, 0.01))%>%
  mutate(Prediction = replace(Prediction, Prediction==0.0, 0.01))
FULL_PREDICTIONS_DT %>% group_by(Peptide)%>% dplyr::summarise(meanScore = mean(WuhanScore))%>% slice_min(order_by = meanScore)
## # A tibble: 1 × 2
##   Peptide   meanScore
##   <chr>         <dbl>
## 1 NGVEGFNCY     0.204
# Remove likely FN
FN_EPS=FULL_PREDICTIONS_DT %>% group_by(Peptide)%>% dplyr::summarise(maxScore = max(WuhanScore))%>% filter(maxScore < 0.45)%>% pull(Peptide)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT%>% filter(! Peptide %in% FN_EPS)
# Get rid of pMHC observations where WT and MT unlikely to be immunogenic.
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% filter(!(Prediction < 0.35 & WuhanScore < 0.35))

FULL_PREDICTIONS_DT %>% dplyr::rename("Variant Peptide"=AASeq2, "HLA Allele"=Predicted_Binding,MutantScore=Prediction,TRAPP_Wuhan=TRAPP_Prediction)%>%
    readr::write_csv(file="/Users/paulbuckley/Nexus365/WIMM CCB - Koohy Group - Documents/Koohy Group/Effect_Mutation_Tcells/V9/WORKING_VERSION/Supplementary Tables/Table3_MutagenesisPredictions.csv")
#saveRDS(FULL_PREDICTIONS_DT, file="INSILICO_MUTAGENESIS_COMPILED_PREDICTIONS_SUBONLY.rds")

5 Create neighbor network RDS files

5.1 change functions

  • Adapted slightly from Repitope. Paralellisation uses out of date packages and won’t work on cluster. Code was adapted to run on a single thread
neighborNetwork <- function(
  peptideSet,
  numSet=NULL,
  directed=F,
  weighted=F,
  coreN=1
){
  # Start calculation
  set.seed(12345)
  time.start <- proc.time()

  # Explore neighbor pairs

  peptideList <- split(peptideSet, nchar(peptideSet))
  cat("Searching neighbors by single-aa substitutions...\n")
  dt_neighbor_sub <- foreach::foreach(i=1:length(peptideList), .inorder=F)%do%{
    l <- nchar(peptideList[[i]][1])
    dt_edges <- apply(data.table::CJ(1:l, Biostrings::AA_STANDARD), 1,
                      function(v){
                        aa_pos <- as.numeric(v[1])
                        aa_sub <- v[2]
                        s <- peptideList[[i]]
                        s_prime <- s
                        stringr::str_sub(s_prime, start=aa_pos, end=aa_pos) <- aa_sub
                        pairPos <- setdiff(which(s_prime %in% s), which(s_prime==s))
                        dt_edges <- data.table::data.table(
                          "AASeq1"=s,
                          "AASeq2"=s_prime,
                          "AA1"=stringr::str_sub(s, start=aa_pos, end=aa_pos),
                          "MutPosition"=aa_pos,
                          "AA2"=aa_sub,
                          "MutPattern"="Substitution"
                        )
                        dt_edges <- dt_edges[pairPos,]
                        dt_edges[,MutType:=paste0(AA1, "_", MutPosition, "_", AA2)][,AA1:=NULL][,MutPosition:=NULL][,AA2:=NULL]
                        return(dt_edges)
                      })
    data.table::rbindlist(dt_edges)
  } %>%
    data.table::rbindlist() %>%
    unique(fromLast=F, by=c("AASeq1","AASeq2","MutPattern"))

  gc();gc()
  cat("Merging...\n")
  dt_neighbor <- rbind(
    dt_neighbor_sub
  )
  dt_neighbor[,MutPattern:=factor(MutPattern, levels=c("Substitution"))]
  data.table::setorder(dt_neighbor, AASeq1, AASeq2, MutPattern, MutType)

  # Construct a neighbor network object
  net_neighbor <- igraph::graph_from_data_frame(dt_neighbor, directed=F)
  net_neighbor <- igraph::set_vertex_attr(net_neighbor, name="label", value=igraph::V(net_neighbor)$name)
  if(is.null(numSet)){
    out <- list("NeighborNetwork"=net_neighbor, "PairDF"=dt_neighbor)
  }else{
    ## Weighted and directed network using a given set of scores
    ## Direction is determined such that the score always increases after mutation
    dt_num <- data.table::data.table("Peptide"=peptideSet, "Score"=numSet)
    dt_neighbor_dw <- data.table::copy(dt_neighbor)
    dt_neighbor_dw <- merge(dt_neighbor_dw, dt_num, by.x="AASeq1", by.y="Peptide")
    dt_neighbor_dw <- merge(dt_neighbor_dw, dt_num, by.x="AASeq2", by.y="Peptide", suffixes=c("1","2"))
    dt_neighbor_dw <- dt_neighbor_dw[Score2>=Score1,]
    dt_neighbor_dw[,ScoreRatio:=Score2/Score1]
    dt_neighbor_dw[,EdgeWeight:=Score1/Score2]
    dt_neighbor_dw[EdgeWeight<0.001, EdgeWeight:=0.001] ### set an arbitrary minimum threshold
    net_neighbor_dw <- igraph::graph_from_data_frame(dt_neighbor_dw, directed=directed)
    net_neighbor_dw <- igraph::set_vertex_attr(net_neighbor_dw, name="label", value=igraph::V(net_neighbor_dw)$name)
    if(weighted==T) igraph::E(net_neighbor_dw)$weight <- igraph::E(net_neighbor_dw)$EdgeWeight
    out <- list(
      "NeighborNetwork"=net_neighbor, "PairDF"=dt_neighbor,
      "NeighborNetwork_DW"=net_neighbor_dw, "PairDF_DW"=dt_neighbor_dw
    )
  }

  # Finish the timer
  time.end <- proc.time()
  message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")

  # Output
  gc();gc()
  return(out)
}

neighborNetwork_Cluster <- function(peptide, graph, metadataDF, seed=12345, plot=T){
  ## Peptide labels
  peptideLabels <- peptide
  peptideSet <- igraph::V(graph)$"name"
  igraph::V(graph)$label[!peptideSet %in% peptideLabels] <- ""

  ## Metadata
  metadataDF <- dplyr::filter(metadataDF, Peptide %in% peptideSet)
  df_meta <- dplyr::select(metadataDF, Peptide, ImmunogenicityScore)
  if("Immunogenicity" %in% colnames(metadataDF)){
    df_meta$Immunogenicity <- metadataDF$Immunogenicity
    igraph::V(graph)$Immunogenicity <- df_meta$Immunogenicity
  }
  igraph::V(graph)$ImmunogenicityScore <- df_meta$ImmunogenicityScore

  ## Clusters
  set.seed(seed)
  df_meta <- dplyr::left_join(
    dplyr::tibble("Peptide"=peptideSet,
                  "Target"=dplyr::if_else(peptideSet==peptide, "Target", "Neighbor"),
                  "ClusterID"=paste0("Cluster", igraph::cluster_walktrap(graph)$membership)),
    df_meta,
    by="Peptide"
  )

  ## No plot ver.
  if(plot!=T) return(df_meta)

  ## Coordinates
  set.seed(seed)
  l <- igraph::layout_nicely(graph)
  df_meta <- dplyr::left_join(
    magrittr::set_colnames(cbind(dplyr::tibble("Peptide"=peptideSet), as.data.frame(l)), c("Peptide","x","y")),
    df_meta,
    by="Peptide"
  )

  ## Consensus per cluster
  clusteredPeptides <- df_meta %>%
    dplyr::arrange(ClusterID) %>%
    dplyr::mutate(ClusterID=as.character(ClusterID)) %>%
    data.table::as.data.table() %>%
    split(by="ClusterID") %>%
    lapply(function(d){d[["Peptide"]]})
  consensusSequence <- function(sequenceSet){
    if(length(sequenceSet)==1) return(sequenceSet)
    sink(tempfile())
    s <- msa::msaConsensusSequence(msa::msaClustalW(sequenceSet, type="protein"), type="Biostrings", ambiguityMap="X", threshold=0.5)
    sink()
    return(s)
  }
  clusterConsensusSeqs <- sapply(clusteredPeptides, consensusSequence)
  clusterConsensusSeqs <- stringr::str_replace_all(clusterConsensusSeqs, stringr::fixed("-"), "X")

  ## Graph plot
  clusterGraphPlot <- function(g, meta, layout, seed=12345){
    ### Vertex annotations
    colPal <- function(x){
      pal <- colorRamp(append(ggsci::pal_d3()(2), "white", after=1), space="rgb")
      cols <- pal(x)
      apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)})
    }
    clusterColors <- meta %>%
      dplyr::group_by(ClusterID) %>%
      dplyr::summarise(ImmunogenicityColor=colPal(mean(ImmunogenicityScore)))
    clusterColors <- scales::alpha(clusterColors$"ImmunogenicityColor", alpha=0.75)
    vertexColors <- colPal(meta$"ImmunogenicityScore")
    if(is.null(meta$"Immunogenicity")){
      vertexShapes <- "circle"
    }else{
      vertexShapes <- dplyr::if_else(meta$"Immunogenicity"=="Positive", "circle", "square")
    }
    vertexLabels <- igraph::V(g)$"label"

    ### Cluster labels
    clusterCentroids <- meta %>%
      dplyr::group_by(ClusterID) %>%
      dplyr::summarise(x=mean(x), y=mean(y))
    clusterCentroids$x <- scales::rescale(clusterCentroids$x, to=c(-1, 1))
    clusterCentroids$y <- scales::rescale(clusterCentroids$y, to=c(-1, 1))

    ### Graph
    try(dev.off(), silent=T)
    set.seed(seed)
    plot(
      igraph::cluster_walktrap(g), g,
      layout=layout,
      col=vertexColors,
      mark.border="black",
      mark.col=clusterColors,
      vertex.size=3,
      vertex.shape=vertexShapes,
      vertex.label=vertexLabels,
      vertex.label.color="black",
      vertex.label.cex=1.25,
      vertex.label.dist=0.5,
      vertex.label.family="sans",
      vertex.color=vertexColors,
      vertex.frame.color="black",
      edge.width=0.5,
      edge.arrow.size=0.25,
      edge.arrow.width=0.1,
      edge.color=scales::alpha("gray50", 0.75)
    )
    for(i in 1:length(clusterCentroids$ClusterID)){
      text(
        rep(clusterCentroids$x[[i]], 2),
        clusterCentroids$y[[i]]+c(0, -0.1),
        c(clusterCentroids$ClusterID[[i]], clusterConsensusSeqs[[i]]),
        pos=3, cex=1.25, family="sans"
      )
    }
    return(recordPlot())
  }
  neighborPlot <- clusterGraphPlot(
    g=graph,
    meta=df_meta,
    layout=l,
    seed=seed
  )
  return(list(
    "SummaryDF"=df_meta,
    "NeighborPlot"=neighborPlot
  ))
}

#' @export
#' @rdname NeighborNetwork_Clustering
#' @name NeighborNetwork_Clustering
neighborNetwork_Cluster_Batch <- function(neighborNetResult, metadataDF, seed=12345, coreN=parallel::detectCores(logical=F),origPeptide=PEPTIDE){
  ## Extract peptide sequences
  peptideSet <- igraph::V(neighborNetResult$"NeighborNetwork_DW")$"name"

  #orig=origPeptide

  ## Cluster analysis
  message("Clustering neighbor network...")

  res <- foreach::foreach(pept=peptideSet, .inorder=F)%do%{
    graph <- Repitope::neighborNetwork_ConnectedSubGraph(neighborNetResult, pept)
    df_meta <- neighborNetwork_Cluster(pept, graph, metadataDF, seed)
    pos <- grep("Target", df_meta$"Target")
    clust <- df_meta$"ClusterID"[[pos]]
    score <- df_meta$"ImmunogenicityScore"[[pos]]
    list("Peptide"=pept, "Score"=score, "ClusterID"=clust, "SummaryDF"=df_meta)
  }
  gc();gc()
  return(res)
}

neighborNetwork_Cluster_FeatureDF <- function(neighborNetClusterResult, coreN=parallel::detectCores(logical=F)){
  message("Computing cluster-based metrics...")

  dt_feat <- foreach::foreach(i=1:length(neighborNetClusterResult), .inorder=F, .packages=c("dplyr","data.table"))%do%{
    summaryDF <- neighborNetClusterResult[[i]][["SummaryDF"]]$SummaryDF
    summaryDF <- dplyr::arrange(summaryDF, dplyr::desc(Target))
    dt <- data.table::as.data.table(summaryDF[1,])
    aveDF <- summaryDF %>%
      dplyr::group_by(ClusterID) %>%
      dplyr::summarise(ImmunogenicityScore=mean(ImmunogenicityScore))
    dt[,ImmunogenicityScore_Cluster_Average:=dplyr::filter(aveDF, ClusterID==dt$"ClusterID"[1])$"ImmunogenicityScore"]
    aveDF <- dplyr::filter(aveDF, ClusterID!=dt$"ClusterID"[1])
    dt[,ImmunogenicityScore_Cluster_Diff_Max:=max(aveDF$"ImmunogenicityScore") - ImmunogenicityScore_Cluster_Average]
    dt[,ImmunogenicityScore_Cluster_Diff_Min:=ImmunogenicityScore_Cluster_Average - min(aveDF$"ImmunogenicityScore")]  ## Considered to be an "escape potential"
    dt[,ImmunogenicityScore_Diff_Max:=max(summaryDF$"ImmunogenicityScore") - ImmunogenicityScore]
    dt[,ImmunogenicityScore_Diff_Min:=ImmunogenicityScore - min(summaryDF$"ImmunogenicityScore")]
    dt
  } %>%
    data.table::rbindlist()
  gc();gc()
  return(dt_feat)
}

neighborNetwork_Cluster <- function(peptide, graph, metadataDF, seed=12345, plot=T){
  ## Peptide labels
  peptideLabels <- peptide
  peptideSet <- igraph::V(graph)$"name"
  igraph::V(graph)$label[!peptideSet %in% peptideLabels] <- ""

  ## Metadata
  metadataDF <- dplyr::filter(metadataDF, Peptide %in% peptideSet)
  df_meta <- dplyr::select(metadataDF, Peptide, ImmunogenicityScore)
  if("Immunogenicity" %in% colnames(metadataDF)){
    df_meta$Immunogenicity <- metadataDF$Immunogenicity
    igraph::V(graph)$Immunogenicity <- df_meta$Immunogenicity
  }
  igraph::V(graph)$ImmunogenicityScore <- df_meta$ImmunogenicityScore

  ## Clusters
  set.seed(seed)
  df_meta <- dplyr::left_join(
    dplyr::tibble("Peptide"=peptideSet,
                  "Target"=dplyr::if_else(peptideSet==peptide, "Target", "Neighbor"),
                  "ClusterID"=paste0("Cluster", igraph::cluster_walktrap(graph)$membership)),
    df_meta,
    by="Peptide"
  )

  ## No plot ver.
  if(plot!=T) return(df_meta)

  ## Coordinates
  set.seed(seed)
  l <- igraph::layout_nicely(graph)
  df_meta <- dplyr::left_join(
    magrittr::set_colnames(cbind(dplyr::tibble("Peptide"=peptideSet), as.data.frame(l)), c("Peptide","x","y")),
    df_meta,
    by="Peptide"
  )

  ## Consensus per cluster
  clusteredPeptides <- df_meta %>%
    dplyr::arrange(ClusterID) %>%
    dplyr::mutate(ClusterID=as.character(ClusterID)) %>%
    data.table::as.data.table() %>%
    split(by="ClusterID") %>%
    lapply(function(d){d[["Peptide"]]})
  consensusSequence <- function(sequenceSet){
    if(length(sequenceSet)==1) return(sequenceSet)
    sink(tempfile())
    s <- msa::msaConsensusSequence(msa::msaClustalW(sequenceSet, type="protein"), type="Biostrings", ambiguityMap="X", threshold=0.5)
    sink()
    return(s)
  }
  clusterConsensusSeqs <- sapply(clusteredPeptides, consensusSequence)
  clusterConsensusSeqs <- stringr::str_replace_all(clusterConsensusSeqs, stringr::fixed("-"), "X")

  ## Graph plot
  clusterGraphPlot <- function(g, meta, layout, seed=12345){
    ### Vertex annotations
    colPal <- function(x){
      pal <- colorRamp(append(ggsci::pal_d3()(2), "white", after=1), space="rgb")
      cols <- pal(x)
      apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)})
    }
    clusterColors <- meta %>%
      dplyr::group_by(ClusterID) %>%
      dplyr::summarise(ImmunogenicityColor=colPal(mean(ImmunogenicityScore)))
    clusterColors <- scales::alpha(clusterColors$"ImmunogenicityColor", alpha=0.75)
    vertexColors <- colPal(meta$"ImmunogenicityScore")
    if(is.null(meta$"Immunogenicity")){
      vertexShapes <- "circle"
    }else{
      vertexShapes <- dplyr::if_else(meta$"Immunogenicity"=="Positive", "circle", "square")
    }
    vertexLabels <- igraph::V(g)$"label"

    ### Cluster labels
    clusterCentroids <- meta %>%
      dplyr::group_by(ClusterID) %>%
      dplyr::summarise(x=mean(x), y=mean(y))
    clusterCentroids$x <- scales::rescale(clusterCentroids$x, to=c(-1, 1))
    clusterCentroids$y <- scales::rescale(clusterCentroids$y, to=c(-1, 1))

    ### Graph
    try(dev.off(), silent=T)
    set.seed(seed)
    plot(
      igraph::cluster_walktrap(g), g,
      layout=layout,
      col=vertexColors,
      mark.border="black",
      mark.col=clusterColors,
      vertex.size=3,
      vertex.shape=vertexShapes,
      vertex.label=vertexLabels,
      vertex.label.color="black",
      vertex.label.cex=1.25,
      vertex.label.dist=0.5,
      vertex.label.family="sans",
      vertex.color=vertexColors,
      vertex.frame.color="black",
      edge.width=0.5,
      edge.arrow.size=0.25,
      edge.arrow.width=0.1,
      edge.color=scales::alpha("gray50", 0.75)
    )
    for(i in 1:length(clusterCentroids$ClusterID)){
      text(
        rep(clusterCentroids$x[[i]], 2),
        clusterCentroids$y[[i]]+c(0, -0.1),
        c(clusterCentroids$ClusterID[[i]], clusterConsensusSeqs[[i]]),
        pos=3, cex=1.25, family="sans"
      )
    }
    return(recordPlot())
  }
  neighborPlot <- clusterGraphPlot(
    g=graph,
    meta=df_meta,
    layout=l,
    seed=seed
  )
  return(list(
    "SummaryDF"=df_meta,
    "NeighborPlot"=neighborPlot
  ))
}

6 Run in silico mutagenesis analysis

6.1 Create datasets for visualisation

  • Creates Neighbor network datasets.
FILEPATH = "NEIGHBOR_NETWORK_ANALYSIS_TRAPP/"
dir.create(FILEPATH)

# For each unique WT wuhan peptide.
for(i in 1:length(unique(FULL_PREDICTIONS_DT$Peptide))){
  PEPTIDE = unique(FULL_PREDICTIONS_DT$Peptide)[i]
  DATASET = FULL_PREDICTIONS_DT %>% filter(Peptide%in% PEPTIDE)
  # Find the Wuhan scores across different pMHC
  WUHAN_SCORE_DISTR = DATASET  %>% filter(Binder == "BINDER") %>% select(Peptide, Predicted_Binding, WuhanScore) %>% distinct() %>% mutate(Dataset="Wuhan") %>% dplyr::rename(Prediction=WuhanScore)
  # Find the mutants
  MT_SCORE_DIST = DATASET %>% select(AASeq2, Predicted_Binding, Prediction) %>% distinct() %>% mutate(Dataset="MT")%>% dplyr::rename(Peptide=AASeq2)
  # For the original score across pMHC, take the mean
  ORIG_SCORE=DATASET %>% filter(Binder == "BINDER") %>% select(Peptide, Predicted_Binding, WuhanScore) %>% distinct() %>% group_by(Peptide) %>% dplyr::summarise(ImmunogenicityScore=mean(WuhanScore))%>% ungroup()
  # Average across MHC for each WT-MT unique sample
  DATASET=DATASET%>% group_by(Peptide, AASeq2) %>% dplyr::summarise(ImmunogenicityScore=mean(Prediction))

  # Generate the neighbour network
  DATASET_NEIGHBOUR = DATASET %>% ungroup%>% select(!Peptide)%>% dplyr::rename(Peptide=AASeq2)
  DATASET_NEIGHBOUR=DATASET_NEIGHBOUR %>% as.data.table
  DATASET_NEIGHBOUR=ORIG_SCORE %>% rbind(DATASET_NEIGHBOUR)%>% as.data.table

  nnet_ISM <- neighborNetwork(DATASET_NEIGHBOUR$Peptide, DATASET_NEIGHBOUR$ImmunogenicityScore)
  PEPTIDE_FILEPATH= paste0(FILEPATH,"/",PEPTIDE,"/")
  dir.create(PEPTIDE_FILEPATH)
  saveRDS(nnet_ISM, file=paste0(PEPTIDE_FILEPATH,"/NeighborNetwork.rds"))

  # Show the density of scores
  DENSITY_PLT=WUHAN_SCORE_DISTR %>% rbind(MT_SCORE_DIST)%>%
          ggdensity(x="Prediction",y="..density..",fill="Dataset")
  ggsave(filename = paste0(PEPTIDE_FILEPATH,"/Density.pdf"))
  seed=1234
  # Create the cluster for neighbor network. As this is substitution based, this is clustered simply on position of mutation
  nnet_ISM_cluster <- neighborNetwork_Cluster_Batch(nnet_ISM, DATASET_NEIGHBOUR[,.(Peptide,ImmunogenicityScore)],seed=seed)
  # Create the featureDF
  nnet_ISM_cluster_featureDF <- neighborNetwork_Cluster_FeatureDF(nnet_ISM_cluster, coreN=5)
  nnet_ISM_cluster_featureDF=nnet_ISM_cluster_featureDF %>% mutate(Peptide_Type = ifelse(Peptide == PEPTIDE, "Original","InSilicoMutated"))

  saveRDS(nnet_ISM_cluster, file.path(PEPTIDE_FILEPATH, "/NeighborNetwork_Cluster.rds"))
  readr::write_csv(nnet_ISM_cluster_featureDF, file.path(PEPTIDE_FILEPATH, "NeighborNetwork_Cluster_FeatureDF.csv"))
  gc();gc()
  # Create Ogishi's summary DT. We don't use this really as it is messy.
  summaryDT <- merge(DATASET_NEIGHBOUR[Peptide %in% PEPTIDE,]%>% mutate(Peptide_Type="Original"), nnet_ISM_cluster_featureDF[Peptide %in% PEPTIDE,], by=c("Peptide","Peptide_Type","ImmunogenicityScore"))
  summaryDT[,EscapePotential:=ImmunogenicityScore_Cluster_Diff_Min]
  summaryDT <- summaryDT[Peptide_Type=="Original", .(Peptide, ImmunogenicityScore, ImmunogenicityScore_Cluster_Average, EscapePotential)]
  data.table::setorder(summaryDT, -ImmunogenicityScore, -ImmunogenicityScore_Cluster_Average, EscapePotential)
  readr::write_csv(summaryDT, file.path(PEPTIDE_FILEPATH, "EpitopePrioritizationSummary.csv"))

  }
neighborNetwork_ConnectedSubGraph <- function(neighborNetResult, peptide){
  connectedPeptides <- c(
    peptide,
    dplyr::filter(neighborNetResult$"PairDF_DW", AASeq1==peptide)$"AASeq2",
    dplyr::filter(neighborNetResult$"PairDF_DW", AASeq2==peptide)$"AASeq1"
  )
  connectedSubgraph <- igraph::induced_subgraph(
    neighborNetResult$"NeighborNetwork_DW",
    which(igraph::V(neighborNetResult$"NeighborNetwork_DW")$label %in% connectedPeptides)
  )
  return(connectedSubgraph)
}
# Read in the summary dataset. This is generated by Ogishi and uses his 'escape potential' metric
# I dont like this metric so i will use avg log ratios for each WT across all mutants
list_of_files = list.files(path="NEIGHBOR_NETWORK_ANALYSIS_TRAPP/",
                           recursive = T,
                           pattern = "EpitopePrioritizationSummary.csv",
                           full.names = T)

EPITOPE_PRIORITIZATION_DT = list_of_files %>% purrr::map_df(fread)
  • Variation in the escape potential of the single amino acid mutation eps in Omicron/
  • AQG* most escape prone?

EPITOPE_PRIORITIZATION_DT %>% ggplot(aes(x=reorder(Peptide, -EscapePotential), y=EscapePotential))+geom_bar(stat = "identity",fill="darkgrey")+theme_pubr()+
  rotate_x_text()+xlab("Peptide")+ geom_hline(yintercept = 0.1, linetype="dashed",color="white")+ geom_hline(yintercept = 0.2, linetype="dashed",color="white")
# Link information about each simulated mutant with the corresponding WT-MT Prediction scores, given TRAP and/or imputation
MUTATIONDATA = readRDS("INSILICO_MUTAGENESIS_WUHANOMICRON_SUB_MUTATIONDATA.rds")%>% dplyr::rename(AASeq2=VariantAlignment)

FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT%>% left_join(MUTATIONDATA)
## Joining, by = c("Peptide", "AASeq2")
#saveRDS(FULL_PREDICTIONS_DT, file="INSILICO_MUTAGENESIS_COMPILED_PREDICTIONS_SUBONLY_w_MUTATIONDATA.rds")

7 Considering all mutations, which are the most impactful?

# Create logs ratios for each sample.
# Poorly named: Prediction = Omicron overall immunogenicity score.
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(logsOdds = log(Prediction/WuhanScore))
FULL_PREDICTIONS_DT = FULL_PREDICTIONS_DT %>% mutate(Length=nchar(Peptide))
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval:
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

7.1 9MERS: ANCHOR

  • Due to the potential for HLA bias confounding the results for anchor positions, we analysed the effects of mutations in anchor and TCR contact positions separately
  • Look at entire mutation i.e A_X_B, removing amino acd ‘A’ from the wildtype (ChangeFrom), in position X (SeqMutPos), replacing with ‘B’ (ChangeTo).
  • Anchor positions not used in manuscript
LENGTH=9
POSITIONS = c(1,2,9)
# What are the mutations which on average cause biggest increases in immunogenicity? Take top 10%
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% # Filter anchor pos
        filter(Length == LENGTH)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% # filter length. Define mutation
        group_by(Mutation)%>% dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% # For each A_X_B mutation, take the mean logs odds.
        arrange(desc(meanlogsOdds))%>% filter(! n<20)%>% # Exclude any samples with n<20
        slice_max(order_by = meanlogsOdds, prop = 0.10) # Take top 10%

# What are the mutations which on average cause biggest decreases in immunogenicity?Take bottom 10%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>%
        filter(Length == LENGTH)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
        group_by(Mutation)%>% dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>%
        arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.10)

# Create vector of this combined set of mutations of interest
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)

# Clean and curate the data of top/bottom 10%. Filter for positions of interest and appropriate peptide lengths. Get rid of SeqMutPos==0 if exists which is generated in old cases for earlier stat comparison.
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
        filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

# Create the summary statistics: what is mean logs odds for each mutation and SD/SE.
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# Plot
MUTS_DT%>%
  ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")

7.2 Change from

  • Look at the logs odds ratios, grouped by the ChangeFrom, i.e the amino acid removed from the widltype.
    • not used in manus. EDA only.
## change from
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
  dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>%
        group_by(ChangeFromPos)%>% filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))

MUTS_DT%>%
  ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")

7.3 Change to

  • Replacing amino acif in the mutant
  • EDA only.
## change to
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.30)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>%
        filter(Length == LENGTH)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))

MUTS_DT%>%
  ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")

 # Color scheme based on chemistry of amino acids
# Nicked from ggseqlogo
         chemistry = data.frame(
           letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'),
           group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)),
           col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)),
           stringsAsFactors = F
         )

8 9MERS: TCR CONTACT

LENGTH=9
POSITIONS = c(3,4,5,6,7,8)
# Filter for TCR contact pos in 9mers
# First, only look at positional effects
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)
# Analyse
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("SeqMutationPos"))
# Plot logs odds
MUTS_DT%>%
  ggplot(aes(x=reorder(SeqMutationPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Sequence Position")+ylab("Logs Odds ratio (MT/WT)")

# color by chenistry
colors = unique(chemistry$col) ; names(colors) = unique(chemistry$group)

#Plot for ChangeFrom amino acid - 9mers.
# Now, look at 'ChangeFrom' amino acid.
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFrom"))
CF_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_removal=logsOdds)
## Joining, by = "letter"
NINE_CONTACT_FROM_PLT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeFrom, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (wildtype removal))")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
NINE_CONTACT_FROM_PLT

saveRDS(NINE_CONTACT_FROM_PLT,file="NINE_CONTACT_FROM_PLT.rds")

#Plot for ChangeTo amino acid - 9mers.
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeTo"))
CT_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_replacement=logsOdds)
## Joining, by = "letter"
NINE_CONTACT_TO_PLT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeTo, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (mutant replacement)")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
NINE_CONTACT_TO_PLT

saveRDS(NINE_CONTACT_TO_PLT,file="NINE_CONTACT_TO_PLT.rds")
#Supplementary scatter plot
CF_MUTS_DT=CF_MUTS_DT %>% select(letter, group, col, logsOdds_removal)
CT_MUTS_DT=CT_MUTS_DT %>% select(letter, group, col, logsOdds_replacement)

NINE_CONTACT_SCATTER_PLT=CF_MUTS_DT %>% inner_join(CT_MUTS_DT) %>%
    ggplot(aes(x=logsOdds_removal, y=logsOdds_replacement, color=group))+geom_point()+
    ggrepel::geom_text_repel(aes(label = letter,  color = group), size = 3)+theme_pubr(base_size = 16)+scale_color_manual(values=colors)+
        ylab("Logs Odds Ratio (Replacement)")+xlab("Logs Odds Ratio (Removal)")+labs(color="chemistry")+guides(color=guide_legend(nrow=2,byrow=TRUE))
## Joining, by = c("letter", "group", "col")
saveRDS(NINE_CONTACT_SCATTER_PLT,file="NINE_CONTACT_SCATTER_PLT.rds")

8.1 Now analyse entire mutation of form A_X_B.

# Top 5% of 9mer contact position mutations. I.e inducing most immunogenic changes.
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>% # Filter 9mer contact
        mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%  # Create mutation of form A_X_B
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>% # Summarise and !n<20
        slice_max(order_by = meanlogsOdds, prop = 0.05) # Top 5%

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.05) # min 5%

# Bind mutations
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)

# Create the dataset based on above mutations and filter settings.
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
        filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))
# Create plot: NINEMER CONTACT, FULL MUTATION.

NINE_CONTACT_TO_SPECIFICMUT_PLT=MUTS_DT%>% mutate(ChangeFrom = stringr::str_extract(Mutation,"[A-Z]\\_"))%>%
        mutate(ChangeFrom = gsub("\\_", "", ChangeFrom))%>% mutate(letter = ChangeFrom) %>%
        inner_join(chemistry)%>%
        ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
        geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values = colors)
## Joining, by = "letter"
NINE_CONTACT_TO_SPECIFICMUT_PLT

saveRDS(NINE_CONTACT_TO_SPECIFICMUT_PLT,file="NINE_CONTACT_TO_SPECIFICMUT_PLT.rds")

8.2 change from including position

  • Look at 9mers ChangeFrom amino acid including position
  • Take 30% for 9mer contact.

MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3) # Take top 30% of 'ChangeFrom' amino acids for 9mers in contact position

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3) # Take bottom 30% of 'ChangeFrom' amino acid changes for 9mers in contact pos.

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)
# Create DT.
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))
# Plot.
MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeFromPos,"[A-Z]"))%>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)

8.3 change to including position

  • TCR contact positions 9mer. ChangeTo including position.
  • 30%

MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3) # Take top 30%.

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))

MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeToPos,"[A-Z]"))%>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)

9 10MERS: ANCHOR

LENGTH=10
POSITIONS = c(1,2,10)

MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.05)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.05)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>%
        filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
        filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))

MUTS_DT%>%
  ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")

9.1 change from

MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))

MUTS_DT%>%
  ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")

9.2 change to


MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))

MUTS_DT%>%
  ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")

10 10MERS: TCR CONTACT

LENGTH=10
POSITIONS = c(3,4,5,6,7,8,9)
# Analyse positional impact
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("SeqMutationPos"))
MUTS_DT%>%
  ggplot(aes(x=reorder(SeqMutationPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Sequence Position")+ylab("Logs Odds ratio (MT/WT)")

# Analyse the effect simply of the 'ChangeFrom' amino acid
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFrom"))
CF_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_removal=logsOdds)
## Joining, by = "letter"
TEN_CONTACT_FROM_PLT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeFrom, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (wildtype removal)")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
TEN_CONTACT_FROM_PLT

saveRDS(TEN_CONTACT_FROM_PLT,file="TEN_CONTACT_FROM_PLT.rds")
# Analyse the effect of the 'ChangeTo' amino acid
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeTo"))
CT_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_replacement=logsOdds)
## Joining, by = "letter"
TEN_CONTACT_TO_PLT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeTo, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (mutant replacement)")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
TEN_CONTACT_TO_PLT

saveRDS(TEN_CONTACT_TO_PLT,file="TEN_CONTACT_TO_PLT.rds")
#Supplementary scatter plot
CF_MUTS_DT=CF_MUTS_DT %>% select(letter, group, col, logsOdds_removal)
CT_MUTS_DT=CT_MUTS_DT %>% select(letter, group, col, logsOdds_replacement)

TEN_CONTACT_SCATTER_PLT=CF_MUTS_DT %>% inner_join(CT_MUTS_DT) %>%
    ggplot(aes(x=logsOdds_removal, y=logsOdds_replacement, color=group))+geom_point()+
    ggrepel::geom_text_repel(aes(label = letter,  color = group), size = 3)+theme_pubr(base_size = 16)+scale_color_manual(values=colors)+
        ylab("Logs Odds Ratio (Replacement)")+xlab("Logs Odds Ratio (Removal)")+labs(color="chemistry")+guides(color=guide_legend(nrow=2,byrow=TRUE))
## Joining, by = c("letter", "group", "col")
TEN_CONTACT_SCATTER_PLT

saveRDS(TEN_CONTACT_SCATTER_PLT,file="TEN_CONTACT_SCATTER_PLT.rds")

10.1 Full mutation

-A_X_B mutation form across TCR contact positionsof 10mers.

# Top 10% of mutations as described previously.
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.10)
# Bottom 10%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.10)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
        filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))

MUTS_DT%>% mutate(ChangeFrom = stringr::str_extract(Mutation,"[A-Z]\\_"))%>%
        mutate(ChangeFrom = gsub("\\_", "", ChangeFrom))%>% mutate(letter = ChangeFrom) %>% inner_join(chemistry)%>%
        ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
        geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values = colors)
## Joining, by = "letter"

10.2 change from including position

  • TCR contact positions 10mer. ChangeFrom including position.
  • 30%
# Top 30%
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)
# Bottom 30%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_",  SeqMutationPos))%>% group_by(ChangeFromPos)%>%
        filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))

MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeFromPos,"[A-Z]"))%>% inner_join(chemistry)%>%
  ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"

10.3 change to including position

  • TCR contact positions 10mer. ChangeTo including position.
  • 30%
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_min(order_by = meanlogsOdds, prop = 0.3)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
        filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))

TEN_CONTACT_TO_AA_POS_PLT=MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeToPos,"[A-Z]"))%>%
        inner_join(chemistry)%>%
        ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
        geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
TEN_CONTACT_TO_AA_POS_PLT

saveRDS(TEN_CONTACT_TO_AA_POS_PLT,file="TEN_CONTACT_TO_AA_POS_PLT.rds")

11 Positional supplementary analysis.

PROP=0.30
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
  dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
  slice_max(order_by = meanlogsOdds, prop = PROP)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
  dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
  slice_min(order_by = meanlogsOdds, prop = PROP)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)

MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
  filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))

MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))


DT_FOR_BIAS_ANALYSIS=MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeToPos,"[A-Z]"))%>% inner_join(chemistry)%>%arrange(desc(logsOdds))%>%
  ungroup() %>% dplyr::mutate(ID=dplyr::row_number())%>% mutate(Position=readr::parse_number(ChangeToPos))

DT_FOR_BIAS_ANALYSIS_OUT = foreach::foreach(i=1:(nrow(DT_FOR_BIAS_ANALYSIS)-4), .combine = "rbind")%do% {
  DT_FOR_BIAS_ANALYSIS %>% filter(ID %in% seq(i,(i+4)))%>% group_by(Position) %>% dplyr::summarise(n=n())%>% mutate(ID=i)
}

DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")

DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% ungroup()%>%mutate(Position=as.character(Position))%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")

POSITION=3
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)


POSITION=4
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)



POSITION=5
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)



POSITION=6
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)



POSITION=7
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)

POSITION=8
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)

POSITION=9
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
        filter(Position==POSITION)%>%
        group_by(ID)%>% mutate(Freq=n/sum(n))%>%
  ggline(x="ID",y="n",color="Position")#+ylim(0,5)


11.1 OLD EDA, Ignore



LENGTH=9

MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
  dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<50)%>%
  slice_max(order_by = meanlogsOdds, prop = 0.1)

LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
  dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<50)%>%
  slice_min(order_by = meanlogsOdds, prop = 0.1)

MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)


FULL_PREDICTIONS_DT  %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
  filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))%>%
  mutate(OmicronAASEQ2 = ifelse(AASeq2 %in% MUTATIONS$VariantAlignment, TRUE, FALSE))%>%
  ggplot(aes(x=reorder(Mutation,-logsOdds),y=logsOdds,color=OmicronAASEQ2))+geom_point(aes(alpha=OmicronAASEQ2))+theme_pubr()+rotate_x_text()+geom_hline(yintercept = 1.0,linetype="dashed")+
  scale_alpha_manual(values=c(0.2,1))+
  scale_x_discrete(guide = guide_axis(check.overlap = TRUE))+xlab("Peptide")+ylab("Logs Odds (Mutant/Wild-type)")


FULL_PREDICTIONS_DT  %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_",  SeqMutationPos,"_",ChangeTo))%>%
  filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))%>%
  mutate(OmicronAASEQ2 = ifelse(AASeq2 %in% MUTATIONS$VariantAlignment, TRUE, FALSE)) %>% filter(OmicronAASEQ2==TRUE)%>% group_by(Mutation, Peptide) %>% dplyr::summarise(meanLogs = mean(logsOdds))

12 Run Analysis

```{,dpi=300, fig.width = 10, fig.height = 10} seed=1234 FILEPATH = “NEIGHBOR_NETWORK_ANALYSIS_TRAPP/” for(i in 1:length(unique(FULL_PREDICTIONS_DT\(Peptide))){ # Grab required data PEPTIDE = unique(FULL_PREDICTIONS_DT\)Peptide)[i] DATASET = FULL_PREDICTIONS_DT %>% filter(Peptide%in% PEPTIDE) DATASET=DATASET%>% group_by(Peptide, AASeq2) %>% dplyr::summarise(ImmunogenicityScore=mean(Prediction))

DATASET_NEIGHBOUR = DATASET %>% ungroup%>% select(!Peptide)%>% dplyr::rename(Peptide=AASeq2) DATASET_NEIGHBOUR=DATASET_NEIGHBOUR %>% as.data.table SCORES=DATASET_NEIGHBOUR %>% mutate(Peptide_Type = ifelse(Peptide == PEPTIDE, “Original”,“InSilicoMutated”)) # Read in NNetwork NEIGHBORNETWORK = readRDS(paste0(FILEPATH,“/”,PEPTIDE,“/NeighborNetwork.rds”)) # Read in metadata metadataDF = fread(paste0(FILEPATH,“/”,PEPTIDE,“/NeighborNetwork_Cluster_FeatureDF.csv”)) peptide=PEPTIDE # Below is ogishi code to create visualisation graph=neighborNetwork_ConnectedSubGraph(NEIGHBORNETWORK, peptide) ## Peptide labels peptideLabels <- peptide peptideSet <- igraph::V(graph)\("name" igraph::V(graph)\)label[!peptideSet %in% peptideLabels] <- “”

## Metadata metadataDF <- dplyr::filter(metadataDF, Peptide %in% peptideSet) df_meta <- dplyr::select(metadataDF, Peptide, ImmunogenicityScore) if(“Immunogenicity” %in% colnames(metadataDF)){ df_meta\(Immunogenicity <- metadataDF\)Immunogenicity igraph::V(graph)\(Immunogenicity <- df_meta\)Immunogenicity } igraph::V(graph)\(ImmunogenicityScore <- df_meta\)ImmunogenicityScore

set.seed(seed) df_meta <- dplyr::left_join( dplyr::tibble(“Peptide”=peptideSet, “Target”=dplyr::if_else(peptideSet==peptide, “Target”, “Neighbor”), “ClusterID”=paste0(“Cluster”, igraph::cluster_walktrap(graph)$membership)), df_meta, by=“Peptide” ) ## Coordinates set.seed(seed) l <- igraph::layout_nicely(graph) df_meta <- dplyr::left_join( magrittr::set_colnames(cbind(dplyr::tibble(“Peptide”=peptideSet), as.data.frame(l)), c(“Peptide”,“x”,“y”)), df_meta, by=“Peptide” )

## Consensus per cluster (these are output by Repitope!). We have more clusters for this peptide than reported in Ogishi et al. SAme insight however so maybe he reduced it for complexity clusteredPeptides <- df_meta %>% dplyr::arrange(ClusterID) %>% dplyr::mutate(ClusterID=as.character(ClusterID)) %>% data.table::as.data.table() %>% split(by=“ClusterID”) %>% lapply(function(d){d[[“Peptide”]]}) consensusSequence <- function(sequenceSet){ if(length(sequenceSet)==1) return(sequenceSet) sink(tempfile()) s <- msa::msaConsensusSequence(msa::msaClustalW(sequenceSet, type=“protein”), type=“Biostrings”, ambiguityMap=“X”, threshold=0.5) sink() return(s) } clusterConsensusSeqs <- sapply(clusteredPeptides, consensusSequence) clusterConsensusSeqs <- stringr::str_replace_all(clusterConsensusSeqs, stringr::fixed(“-”), “X”)

## Graph plot ### Vertex annotations colPal <- function(x){ pal <- colorRamp(append(ggsci::pal_d3()(2), “white”, after=1), space=“rgb”) cols <- pal(x) apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)}) } layout=l g=graph

clusterColors <- df_meta %>% dplyr::group_by(ClusterID) %>% dplyr::summarise(ImmunogenicityColor=colPal(mean(ImmunogenicityScore))) clusterColors=clusterColors %>% mutate(ID=readr::parse_number(ClusterID))%>% arrange((ID)) %>% select(!ID)

clusterColors <- scales::alpha(clusterColors$"ImmunogenicityColor", alpha=0.75)
vertexColors <- colPal(df_meta$"ImmunogenicityScore")
if(is.null(df_meta$"Immunogenicity")){
  vertexShapes <- "circle"
}else{
  vertexShapes <- dplyr::if_else(df_meta$"Immunogenicity"=="Positive", "circle", "square")
}
vertexLabels <- igraph::V(g)$"label"

### Cluster labels clusterCentroids <- df_meta %>% dplyr::group_by(ClusterID) %>% dplyr::summarise(x=mean(x), y=mean(y)) clusterCentroids\(x <- scales::rescale(clusterCentroids\)x, to=c(-1, 1)) clusterCentroids\(y <- scales::rescale(clusterCentroids\)y, to=c(-1, 1))

set.seed(seed)
pdf(paste0(FILEPATH,"/",PEPTIDE,"/Plot.pdf"),width = 10,height=10)
plot(
  igraph::cluster_walktrap(g), g,
  layout=layout,
  col=vertexColors,
  mark.border="black",
  mark.col=clusterColors,
  vertex.size=3,
  vertex.shape=vertexShapes,
  vertex.label=vertexLabels,
  vertex.label.color="black",
  vertex.label.cex=1.25,
  vertex.label.dist=0.5,
  vertex.label.family="sans",
  vertex.color=vertexColors,
  vertex.frame.color="black",
  edge.width=0.5,
  edge.arrow.size=0.25,
  edge.arrow.width=0.1,
  edge.color=scales::alpha("gray50", 0.75)
)
for(i in 1:length(clusterCentroids$ClusterID)){
  CLUSTER_IDS=data.table(CLUSTERID=paste0("Cluster",stringr::str_locate(clusterConsensusSeqs[[i]], "X")[1]))
  CLUSTER_IDS=CLUSTER_IDS %>% mutate(CLUSTERID = replace(CLUSTERID, grepl("NA",CLUSTERID), ""))

  text(
    rep(clusterCentroids$x[[i]], 2),
    clusterCentroids$y[[i]]+c(0.02, -0.1),
    c(CLUSTER_IDS$CLUSTERID, clusterConsensusSeqs[[i]]),
    pos=3, cex=1.25, family="sans"
  )
}
dev.off()
gc();gc()

}





# What are most escape prone epitopes?
## V1
- Take mean of pMHC for each WT-MT combo.
- Then calculate log ratio and visualise mean+/se
- This version was replaced as we're using the mean of a set of sample means here. Not accurate as different MT bind diffrent #s of MHC.
-
```{,dpi=300, fig.width = 12}

EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% group_by(Peptide, AASeq2) %>% # For each WT-MT peptide combination.
        dplyr::summarise(meanMTScore=mean(Prediction), meanWuhanScore=mean(WuhanScore))%>%  # Average the MT scores and the Wuhan
        mutate(logsOdds = log(meanMTScore/meanWuhanScore)) %>% # Take the logs odds
        mutate(OmicronAASEQ2 = ifelse(AASeq2 %in% MUTATIONS$VariantAlignment, TRUE, FALSE))

# Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score
MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar="logsOdds", groupvars=c("Peptide"))

ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT%>%
  ggplot(aes(x=reorder(Peptide, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Peptide")+ylab("Logs Odds ratio (MT/WT)")
ESCAPE_POT_PER_PEPTIDE_PLT
saveRDS(ESCAPE_POT_PER_PEPTIDE_PLT,file="ESCAPE_POT_PER_PEPTIDE_PLT.rds")

FULL_PREDICTIONS_DT %>% filter(Peptide %in% 'NGVEGFNCY')

13 V4 - distance based escape potential


EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% mutate(Score = WuhanScore-Prediction)
# Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score
MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar="Score", groupvars=c("Peptide"))

ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT%>%
  ggplot(aes(x=reorder(Peptide, -Score), y=Score))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=Score-se, ymax=Score+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Peptide")+ylab("Avg change in immunogenicity")
ESCAPE_POT_PER_PEPTIDE_PLT

13.1 V2 - taking a simply average. Makes more sense surely?

  • This makes the most sense. Takes a simple average of all WTi-PMHCX - MTi-PMHCX log ratios.
  • This is what we want to achieve: for each WT epitope an indication of how mutation affects it
colPal <- function(x){
      pal <- colorRamp(append(ggsci::pal_d3()(2), "white", after=1), space="rgb")
      cols <- pal(x)
      apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)})
    }

WTMeans_DT=FULL_PREDICTIONS_DT %>% filter(Binder== "BINDER")%>% select(Peptide, Predicted_Binding,WuhanScore)%>% distinct()%>%group_by(Peptide)%>%
        dplyr::summarise(WTMean = mean(WuhanScore))

# Summarise the data per epitope. So for each epitope, average the logs odds.
MUTS_DT=summarySE(FULL_PREDICTIONS_DT, measurevar="logsOdds", groupvars=c("Peptide"))
MUTS_DT=MUTS_DT%>% inner_join(WTMeans_DT)
## Joining, by = "Peptide"
MUTS_DT$ImmunogenicityColor=colPal(MUTS_DT$WTMean)
MUTS_DT$ImmunogenicityColor = scales::alpha(MUTS_DT$ImmunogenicityColor, alpha=0.75)
MUTS_DT=MUTS_DT %>% mutate(logsOdds=logsOdds * -1)
MUTS_DT%>% select(Peptide, logsOdds, se)%>% dplyr::rename(EscapeScore=logsOdds) %>%
        readr::write_csv(file="/Users/paulbuckley/Nexus365/WIMM CCB - Koohy Group - Documents/Koohy Group/Effect_Mutation_Tcells/V11/WORKING_VERSION/Supplementary Tables/Table5_EscapeScores.csv")
ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT %>%
  ggplot(aes(x=reorder(Peptide, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
    geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5,                    # Width of the error bars
                  position=position_dodge(.9))+theme_pubr(base_size = 16)+rotate_x_text()+xlab("Peptide")+
        ylab("Escape Score")#+ scale_fill_identity()+
        #labs(fill="WT Immunogenicity")#+ theme(panel.background = element_rect(fill = '#E3E3E3'))
ESCAPE_POT_PER_PEPTIDE_PLT

saveRDS(ESCAPE_POT_PER_PEPTIDE_PLT,file="ESCAPE_POT_PER_PEPTIDE_PLT.rds")

13.2 V3

  • ignore ```{,dpi=300, fig.width = 12}

EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% mutate(PMHClogsOdds = log(Prediction/WuhanScore)) %>% group_by(Peptide, AASeq2) %>% dplyr::summarise(logsOdds=mean(PMHClogsOdds)) # Average the MT scores and the Wuhan

14 Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score

MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar=“logsOdds”, groupvars=c(“Peptide”))

ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT%>% ggplot(aes(x=reorder(Peptide, -logsOdds), y=logsOdds))+geom_bar(stat = “identity”)+ geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour=“black”, width=.5, # Width of the error bars position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab(“Peptide”)+ylab(“Logs Odds ratio (MT/WT)”) ESCAPE_POT_PER_PEPTIDE_PLT




# Does fraction hydrophobicity correlate with transition potential?
-  in TCR contact vs. anchor

## EDA
- Minor correlation between hydrophobicity in anchor positions and the transition potential



```r
#CONTACT_POS=data.table(POSITIONS=c(3,4,5,6,7,8,9), Length=10)%>% rbind(
#data.table(POSITIONS=c(3,4,5,6,7,8), Length=9))%>%


HYDROPHOBIC_RESIDUES=chemistry %>% filter(group == "Hydrophobic")%>% pull(letter)

NINEMERS=MUTS_DT %>% filter(nchar(Peptide)==9) %>% mutate(TCRContact_Peptide = substr(Peptide, start=3, stop=8))
TENMERS = MUTS_DT %>% filter(nchar(Peptide)==10)%>% mutate(TCRContact_Peptide = substr(Peptide, start=3, stop=9))

MUTS_DT=NINEMERS %>% rbind(TENMERS)

MUTS_DT=MUTS_DT %>% mutate(TCR_HydrophobicCount =  stringi::stri_count_regex(MUTS_DT$TCRContact_Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>%
        mutate(TCR_HydroFraction = TCR_HydrophobicCount/nchar(TCRContact_Peptide))%>% select(!TCR_HydrophobicCount)

MUTS_DT %>% ggscatter(x="logsOdds",y="TCR_HydroFraction", add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE) # Add correlation coefficient. see ?stat_cor)
## `geom_smooth()` using formula 'y ~ x'

MUTS_DT=MUTS_DT %>% mutate(AnchorRes = stringr::str_remove(Peptide, TCRContact_Peptide))
MUTS_DT=MUTS_DT %>% mutate(Anchor_HydrophobicCount =  stringi::stri_count_regex(MUTS_DT$AnchorRes, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>%
        mutate(Anchor_HydroFraction = Anchor_HydrophobicCount/nchar(AnchorRes))%>% select(!Anchor_HydrophobicCount)

MUTS_DT %>% ggscatter(x="logsOdds",y="Anchor_HydroFraction", add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE) # Add correlation coefficient. see ?stat_cor)
## `geom_smooth()` using formula 'y ~ x'

14.1 Positional analysis: encode + XGBOOST

  • Trying to identify a link between features and outcome. I.E hydrophobicity in PX vs. Transition potential.

  • To do this we can analyse by XGBOOST. Robust to correlations in the data

  • First, create a DT with: Peptide, P1, P2, P3, P4. Each PX entry contains AA in that position.

  • dummy contrast encode the data, based on hydrophobicity or not in p1, p2, p3, p4.

  • Build a model and Use feature importance to determine any ;link between features and logs outs ### Start w/ 9mers: XGBOOST on Hydrophbic amino acid in PX. #### Linear regression using continuous target

  • Catgeorical variables binary encoded (dummy contrasted).

  • Predicting logs odds: using a linear regression

  • Result: Perhap hydrophobicity in P7 is related?

# Predict 9mers: logsOdds by categorical variables
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
#stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = "")

NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
#NINEMERS_AA_PX_DT %>% mutate(across(starts_with("V"), ~ifelse(.x=="A", "Hydrophobic","!Hydrophobic")))

NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% as.data.table()%>%
        mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))

NINEMERS_AA_PX_MATRIX = NINEMERS_AA_PX_DT %>% select(!Peptide)
sparse_matrix = sparse.model.matrix(logsOdds ~ ., data = NINEMERS_AA_PX_MATRIX)[,-1]

bst = xgboost(data=sparse_matrix, label = NINEMERS_AA_PX_DT$logsOdds, eta=0.1, objective="reg:linear",max_depth=4,nthread=2, nrounds=15)
## [15:06:32] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:0.692266 
## [2]  train-rmse:0.660792 
## [3]  train-rmse:0.630174 
## [4]  train-rmse:0.603919 
## [5]  train-rmse:0.580132 
## [6]  train-rmse:0.556293 
## [7]  train-rmse:0.536487 
## [8]  train-rmse:0.516462 
## [9]  train-rmse:0.497865 
## [10] train-rmse:0.479217 
## [11] train-rmse:0.465259 
## [12] train-rmse:0.451268 
## [13] train-rmse:0.438826 
## [14] train-rmse:0.426022 
## [15] train-rmse:0.415385
importance = xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importance
##    Feature       Gain      Cover  Frequency
## 1: V6Hydro 0.26323265 0.12313433 0.10731707
## 2: V7Hydro 0.24733192 0.09660033 0.13658537
## 3: V9Hydro 0.15385077 0.25497512 0.07317073
## 4: V1Hydro 0.09814379 0.10945274 0.13170732
## 5: V2Hydro 0.06675875 0.05928690 0.17073171
## 6: V5Hydro 0.05408275 0.10530680 0.10243902
## 7: V3Hydro 0.05207463 0.09991708 0.16097561
## 8: V8Hydro 0.03530849 0.04436153 0.06341463
## 9: V4Hydro 0.02921626 0.10696517 0.05365854
xgb.plot.importance(importance_matrix = importance)

14.1.0.1 logistic using binary target

  • Not clear. Different results based on different model predictor.
# Predict 9mers: binary outcome binary input

#stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = "")

NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
#NINEMERS_AA_PX_DT %>% mutate(across(starts_with("V"), ~ifelse(.x=="A", "Hydrophobic","!Hydrophobic")))

NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% as.data.table()%>%
        mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))

NINEMERS_AA_PX_MATRIX = NINEMERS_AA_PX_DT %>% select(!Peptide)
sparse_matrix = sparse.model.matrix(logsOdds ~ ., data = NINEMERS_AA_PX_MATRIX)[,-1]
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
bst = xgboost(data=sparse_matrix, label = NINEMERS_AA_PX_DT$BINARY_OUT, eta=1, objective="binary:logistic",max_depth=4,nthread=2, nrounds=15)
## [1]  train-logloss:0.436531 
## [2]  train-logloss:0.375321 
## [3]  train-logloss:0.294124 
## [4]  train-logloss:0.261126 
## [5]  train-logloss:0.243249 
## [6]  train-logloss:0.234259 
## [7]  train-logloss:0.231075 
## [8]  train-logloss:0.228008 
## [9]  train-logloss:0.221525 
## [10] train-logloss:0.207540 
## [11] train-logloss:0.203544 
## [12] train-logloss:0.201184 
## [13] train-logloss:0.199824 
## [14] train-logloss:0.195798 
## [15] train-logloss:0.194051
importance = xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importance
##    Feature        Gain      Cover  Frequency
## 1: V5Hydro 0.229909624 0.11152606 0.12820513
## 2: V2Hydro 0.218161826 0.09785625 0.15384615
## 3: V6Hydro 0.176017251 0.19396654 0.15384615
## 4: V3Hydro 0.115665095 0.14125067 0.10256410
## 5: V7Hydro 0.103601480 0.10456772 0.05128205
## 6: V4Hydro 0.070428880 0.07177072 0.10256410
## 7: V1Hydro 0.047210622 0.11959293 0.10256410
## 8: V9Hydro 0.032514794 0.09264624 0.12820513
## 9: V8Hydro 0.006490428 0.06682288 0.07692308
xgb.plot.importance(importance_matrix = importance)

NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% as.data.table()%>%
        mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, 1,0))


#linear_model = lm(formula = logsOdds ~ V1+V2+V3+V4+V5+V6+V7+V8+V9, data=NINEMERS_AA_PX_DT)
#summary(linear_model)

14.1.0.2 Use boruta for feature importance

  • First graph shows variables (binarised hydrophobicity in PX) including anchor and TCR hydrofraction vs. logs odds score
  • Second graph shows variables (binarised hydrophobicity in PX) including anchor and TCR hydrofraction vs. binarised outcome.
  • Looking at binarised outcome suggests V7 again. Consistent with XGBOOST, but different predictor.
  • Not the most conclusive evidence
library(Boruta)
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT %>% inner_join(MUTS_DT %>% select(Peptide, Anchor_HydroFraction, TCR_HydroFraction))%>%
        dplyr::rename(AnchorHydro=Anchor_HydroFraction, TCRHydro=TCR_HydroFraction)
## Joining, by = "Peptide"
NINEMERS_AA_PX_DT_FULL=NINEMERS_AA_PX_DT
# Boruta using logsOdds out
boruta_output <- Boruta(logsOdds ~ ., data=NINEMERS_AA_PX_DT%>% select(! Peptide), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
## [1] meanImp  decision
## <0 rows> (or 0-length row.names)
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

# Using binary output
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
boruta_output <- Boruta(BINARY_OUT ~ ., data=NINEMERS_AA_PX_DT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##    meanImp  decision
## V7 7.79069 Confirmed
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

14.1.0.3 Try one hot code amino acid positions

  • One hot encode the presence of an amino acid at PX in WT.
  • V9Y means the wildtype has a Y in P9 of the sequence
  • A3, R3, Q4, 9Y potentially predictive vs. logs odds
library(mltools)
## 
## Attaching package: 'mltools'
## The following objects are masked from 'package:yardstick':
## 
##     mcc, rmse
## The following object is masked from 'package:tidyr':
## 
##     replace_na
NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))

NINEMERS_AA_ONEHOT=caret::dummyVars(" ~ . ", NINEMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds)))
newdata <- data.frame(predict(NINEMERS_AA_ONEHOT, newdata = NINEMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds))))

NINEMERS_AA_ONEHOT=NINEMERS_AA_PX_DT %>% select(Peptide, logsOdds)%>% cbind(newdata)
NINEMERS_AA_ONEHOT=NINEMERS_AA_ONEHOT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
colnames(NINEMERS_AA_ONEHOT) = gsub("V","",colnames(NINEMERS_AA_ONEHOT))
NINEMERS_AA_PX_DT_FULL=NINEMERS_AA_PX_DT_FULL %>% inner_join(NINEMERS_AA_ONEHOT)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=NINEMERS_AA_ONEHOT%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##        meanImp  decision
## `9Y` 16.228629 Confirmed
## `4Q`  8.600995 Confirmed
## `3R`  6.859537 Confirmed
## `3A`  6.371848 Confirmed
boruta_output
## Boruta performed 79 iterations in 2.26899 secs.
##  4 attributes confirmed important: `3A`, `3R`, `4Q`, `9Y`;
##  133 attributes confirmed unimportant: `1A`, `1C`, `1F`, `1G`, `1H` and
## 128 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

  • nothing predictive using binary out
boruta_output <- Boruta(BINARY_OUT ~ ., data=NINEMERS_AA_ONEHOT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
## [1] meanImp  decision
## <0 rows> (or 0-length row.names)
boruta_output
## Boruta performed 14 iterations in 0.9822969 secs.
##  No attributes deemed important.
##  137 attributes confirmed unimportant: `1A`, `1C`, `1F`, `1G`, `1H` and
## 132 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

14.1.0.3.1 AA composition
  • Existence of ‘N’ in sequence is perhaps predictive.
# Simply AA composition

AACOMP_DT_NINEMERS=foreach::foreach(i=1:nrow(NINEMERS_AA_PX_DT),.combine = "rbind", .packages = c("dplyr","data.table","protr"))%do% {
  PEPTIDE = NINEMERS_AA_PX_DT %>% filter(row_number()==i) %>% pull(Peptide)
  AA_COMP = protr::extractAAC(PEPTIDE)
  DT=NINEMERS_AA_PX_DT %>% filter(row_number()==i) %>% select(Peptide, logsOdds)
  AA_COMP_DT= t(AA_COMP) %>% tibble::as.tibble()
  DT %>% cbind(AA_COMP_DT)
}
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
boruta_output <- Boruta(logsOdds ~ ., data=AACOMP_DT_NINEMERS%>% select(! c(Peptide)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##    meanImp  decision
## N 6.089681 Confirmed
boruta_output
## Boruta performed 45 iterations in 0.841604 secs.
##  1 attributes confirmed important: N;
##  19 attributes confirmed unimportant: A, C, D, E, F and 14 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

linear_model = lm(formula = logsOdds ~ G+S+T+Y+C+N+Q+K+R+H+D+E+P+A+W+F+L+I+M+V, data=AACOMP_DT_NINEMERS)
summary(linear_model)
## 
## Call:
## lm(formula = logsOdds ~ G + S + T + Y + C + N + Q + K + R + H + 
##     D + E + P + A + W + F + L + I + M + V, data = AACOMP_DT_NINEMERS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8148 -0.3108  0.0427  0.2759  1.0856 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  2.57414    1.47689   1.743   0.0960 .
## G           -5.51339    1.99350  -2.766   0.0116 *
## S           -2.62162    1.77830  -1.474   0.1553  
## T            2.88371    2.77491   1.039   0.3105  
## Y           -2.63400    2.72076  -0.968   0.3440  
## C           -3.98467    3.30443  -1.206   0.2413  
## N           -4.36494    2.04596  -2.133   0.0448 *
## Q           -1.06714    2.38385  -0.448   0.6590  
## K            2.20786    1.92854   1.145   0.2652  
## R           -3.13333    2.32120  -1.350   0.1914  
## H           -3.02713    4.13155  -0.733   0.4718  
## D           -5.90712    3.09920  -1.906   0.0704 .
## E           -8.08605    3.06219  -2.641   0.0153 *
## P           -4.04966    2.09788  -1.930   0.0672 .
## A           -1.68112    2.63206  -0.639   0.5299  
## W           -9.24883    3.58652  -2.579   0.0175 *
## F           -2.47352    2.11475  -1.170   0.2552  
## L           -0.03142    2.47173  -0.013   0.9900  
## I           -1.31770    2.64432  -0.498   0.6234  
## M            1.84980    5.28006   0.350   0.7296  
## V                 NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5898 on 21 degrees of freedom
## Multiple R-squared:  0.6401, Adjusted R-squared:  0.3145 
## F-statistic: 1.966 on 19 and 21 DF,  p-value: 0.06776

14.1.0.4 combine all predictors for 9mers

  • Presence of N in wild type and existence of A in P3, A in P4, Q in P4, Y in P9 are perhaps predictive.
# cmobine everything
NINEMERS_AA_PX_DT_FULL=NINEMERS_AA_PX_DT_FULL %>% inner_join(AACOMP_DT_NINEMERS)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=NINEMERS_AA_PX_DT_FULL%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##        meanImp  decision
## `9Y` 11.954615 Confirmed
## `4Q`  9.298192 Confirmed
## `3A`  6.358678 Confirmed
boruta_output
## Boruta performed 99 iterations in 3.308444 secs.
##  3 attributes confirmed important: `3A`, `4Q`, `9Y`;
##  163 attributes confirmed unimportant: A, AnchorHydro, C, D, E and 158
## more;
##  2 tentative attributes left: N, `8L`;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

14.2 Same analysis for 10mers

14.2.1 XGBOOST

  • No obvious results
TENMERS_AA_PX_DT=data.table(Peptide=TENMERS$Peptide, logsOdds=TENMERS$logsOdds) %>% cbind(stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = ""))
#NINEMERS_AA_PX_DT %>% mutate(across(starts_with("V"), ~ifelse(.x=="A", "Hydrophobic","!Hydrophobic")))

TENMERS_AA_PX_DT=TENMERS_AA_PX_DT%>% as.data.table()%>%
        mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
        mutate(V10 = ifelse(V10 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))

TENMERS_AA_PX_MATRIX = TENMERS_AA_PX_DT %>% select(!Peptide)
sparse_matrix = sparse.model.matrix(logsOdds ~ ., data = TENMERS_AA_PX_MATRIX)[,-1]
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT%>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
bst = xgboost(data=sparse_matrix, label = TENMERS_AA_PX_DT$BINARY_OUT, eta=1, objective="binary:logistic",max_depth=4,nthread=2, nrounds=15)
## [1]  train-logloss:0.553929 
## [2]  train-logloss:0.493942 
## [3]  train-logloss:0.464173 
## [4]  train-logloss:0.439194 
## [5]  train-logloss:0.425441 
## [6]  train-logloss:0.414300 
## [7]  train-logloss:0.404312 
## [8]  train-logloss:0.393396 
## [9]  train-logloss:0.385126 
## [10] train-logloss:0.370233 
## [11] train-logloss:0.363904 
## [12] train-logloss:0.347948 
## [13] train-logloss:0.344436 
## [14] train-logloss:0.338114 
## [15] train-logloss:0.335126
importance = xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importance
##     Feature        Gain      Cover  Frequency
## 1:  V5Hydro 0.216322121 0.13842264 0.16129032
## 2:  V1Hydro 0.208311883 0.17971085 0.16129032
## 3:  V3Hydro 0.184389950 0.22137734 0.19354839
## 4:  V6Hydro 0.141222662 0.12043871 0.09677419
## 5:  V7Hydro 0.126480483 0.12675823 0.16129032
## 6:  V8Hydro 0.052622959 0.04381815 0.03225806
## 7:  V2Hydro 0.049335904 0.08890103 0.09677419
## 8: V10Hydro 0.017674372 0.04644848 0.06451613
## 9:  V4Hydro 0.003639666 0.03412455 0.03225806
xgb.plot.importance(importance_matrix = importance)

TENMERS_AA_PX_DT=data.table(Peptide=TENMERS$Peptide, logsOdds=TENMERS$logsOdds) %>% cbind(stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = ""))
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT%>% as.data.table()%>%
        mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
        mutate(V10 = ifelse(V10 %in% HYDROPHOBIC_RESIDUES, 1,0))



#linear_model = lm(formula = logsOdds ~ V1+V2+V3+V4+V5+V6+V7+V8+V9, data=NINEMERS_AA_PX_DT)
#summary(linear_model)

14.3 useboruta on binarised hydrophobicity in PX, incorporating anchor and tcr hydro fraction

  • nothing significant for both logs odds and binary out
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT %>% inner_join(MUTS_DT %>% select(Peptide, Anchor_HydroFraction, TCR_HydroFraction))%>%
        dplyr::rename(AnchorHydro=Anchor_HydroFraction, TCRHydro=TCR_HydroFraction)
## Joining, by = "Peptide"
TENMERS_AA_PX_DT_FULL=TENMERS_AA_PX_DT

# Boruta using logsOdds out
boruta_output <- Boruta(logsOdds ~ ., data=TENMERS_AA_PX_DT%>% select(! Peptide), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
## [1] meanImp  decision
## <0 rows> (or 0-length row.names)
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

# Using binary output
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
boruta_output <- Boruta(BINARY_OUT ~ ., data=TENMERS_AA_PX_DT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
## [1] meanImp  decision
## <0 rows> (or 0-length row.names)
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

14.3.1 One hot encode again. V7L == 1, means WT contains L in P7.

  • LOGS ODDS: V3V, V7L importaant. Perhaps V5K, V6P, V7G/S, V8G
  • BINARY: V1F, V2E, V7G
TENMERS_AA_PX_DT=data.table(Peptide=TENMERS$Peptide, logsOdds=TENMERS$logsOdds) %>% cbind(stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = ""))

TENMERS_AA_ONEHOT=caret::dummyVars(" ~ . ", TENMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds)))
newdata <- data.frame(predict(TENMERS_AA_ONEHOT, newdata = TENMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds))))

TENMERS_AA_ONEHOT=TENMERS_AA_PX_DT %>% select(Peptide, logsOdds)%>% cbind(newdata)
TENMERS_AA_ONEHOT=TENMERS_AA_ONEHOT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
colnames(TENMERS_AA_ONEHOT) = gsub("V","",colnames(TENMERS_AA_ONEHOT))

TENMERS_AA_PX_DT_FULL=TENMERS_AA_PX_DT_FULL %>% inner_join(TENMERS_AA_ONEHOT)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=TENMERS_AA_ONEHOT%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##      meanImp  decision
## `3` 6.713622 Confirmed
boruta_output
## Boruta performed 99 iterations in 3.086757 secs.
##  1 attributes confirmed important: `3`;
##  125 attributes confirmed unimportant: `10F`, `10I`, `10K`, `10L`,
## `10R` and 120 more;
##  7 tentative attributes left: `5K`, `5S`, `7G`, `7L`, `7S` and 2 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

boruta_output <- Boruta(BINARY_OUT ~ ., data=TENMERS_AA_ONEHOT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##        meanImp  decision
## `7G` 12.337362 Confirmed
## `1F`  9.320999 Confirmed
boruta_output
## Boruta performed 99 iterations in 2.760607 secs.
##  2 attributes confirmed important: `1F`, `7G`;
##  130 attributes confirmed unimportant: `10F`, `10I`, `10K`, `10L`,
## `10R` and 125 more;
##  1 tentative attributes left: `1Q`;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

14.3.2 AA comp

  • tentative A,G,N in WT.
# Simply AA composition
AACOMP_DT_TENMERS=foreach::foreach(i=1:nrow(TENMERS_AA_PX_DT),.combine = "rbind", .packages = c("dplyr","data.table","protr"))%do% {
  PEPTIDE = TENMERS_AA_PX_DT %>% filter(row_number()==i) %>% pull(Peptide)
  AA_COMP = protr::extractAAC(PEPTIDE)
  DT=TENMERS_AA_PX_DT %>% filter(row_number()==i) %>% select(Peptide, logsOdds)
  AA_COMP_DT= t(AA_COMP) %>% tibble::as.tibble()
  DT %>% cbind(AA_COMP_DT)
}

boruta_output <- Boruta(logsOdds ~ ., data=AACOMP_DT_TENMERS%>% select(! c(Peptide)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##    meanImp  decision
## N 3.304996 Confirmed
## S 2.682743 Confirmed
boruta_output
## Boruta performed 99 iterations in 2.019348 secs.
##  No attributes deemed important.
##  16 attributes confirmed unimportant: C, D, E, F, H and 11 more;
##  4 tentative attributes left: A, G, N, S;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

linear_model = lm(formula = logsOdds ~ G+S+T+Y+C+N+Q+K+R+H+D+E+P+A+W+F+L+I+M+V, data=AACOMP_DT_TENMERS)
summary(linear_model)
## 
## Call:
## lm(formula = logsOdds ~ G + S + T + Y + C + N + Q + K + R + H + 
##     D + E + P + A + W + F + L + I + M + V, data = AACOMP_DT_TENMERS)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58330 -0.14425 -0.03811  0.22040  0.40244 
## 
## Coefficients: (2 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.5761     2.3214  -0.248    0.811
## G             1.0597     4.6133   0.230    0.825
## S             2.5072     2.7670   0.906    0.395
## T            -2.3503     3.0694  -0.766    0.469
## Y             3.3146     3.7571   0.882    0.407
## C            -2.9864     3.9186  -0.762    0.471
## N            -5.0087     2.7558  -1.818    0.112
## Q            -0.8728     2.4530  -0.356    0.732
## K             1.6939     2.5504   0.664    0.528
## R             4.6325     3.6065   1.284    0.240
## H             6.9650     6.6889   1.041    0.332
## D             1.9650     3.9476   0.498    0.634
## E            -1.6340     3.4608  -0.472    0.651
## P             2.5032     4.8145   0.520    0.619
## A             3.8156     3.6678   1.040    0.333
## W             5.4173     5.2457   1.033    0.336
## F             1.8980     2.6660   0.712    0.500
## L            -0.4852     3.3743  -0.144    0.890
## I            -0.2274     2.8460  -0.080    0.939
## M                 NA         NA      NA       NA
## V                 NA         NA      NA       NA
## 
## Residual standard error: 0.4593 on 7 degrees of freedom
## Multiple R-squared:  0.8026, Adjusted R-squared:  0.2949 
## F-statistic: 1.581 on 18 and 7 DF,  p-value: 0.2766

14.3.2.1 everything: only V7l, and perhaps N.

# cmobine everything
TENMERS_AA_PX_DT_FULL=TENMERS_AA_PX_DT_FULL %>% inner_join(AACOMP_DT_TENMERS)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=TENMERS_AA_PX_DT_FULL%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ])  # descending sort
##       meanImp  decision
## `7L` 7.212546 Confirmed
boruta_output
## Boruta performed 99 iterations in 3.222649 secs.
##  1 attributes confirmed important: `7L`;
##  159 attributes confirmed unimportant: AnchorHydro, C, D, E, F and 154
## more;
##  5 tentative attributes left: A, `3L`, `3`, `5K`, `7G`;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")

chemistry %>% DT::datatable()

14.4 Take a simpler approach: 9mers

#EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% mutate(Score = WuhanScore-Prediction)
# Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score
#MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar="Score", groupvars=c("Peptide"))

HIGH_SET = MUTS_DT%>% slice_max(order_by = logsOdds, n=9) %>% mutate(Dataset = "High")
LOW_SET = MUTS_DT%>% slice_min(order_by = logsOdds, n=9) %>% mutate(Dataset = "Negative")

LOW_HIGH_DT= LOW_SET %>% rbind(HIGH_SET)

my_comparisons = list(c("High","Negative"))

14.4.1 Look at fraction of polar, neutral, basic, acidic.

REMAINING_CHEMISTRY=chemistry #%>% filter(! group == "Hydrophobic")

#REMAINING_CHEMISTRY_MUTS_DT=foreach::foreach(i=1:length(unique(REMAINING_CHEMISTRY$group)), .combine = "rbind", .packages = c("dplyr","data.table"))%do% {
 # CHEMISTRY_GRP = unique(REMAINING_CHEMISTRY$group)[i]
 # CHEMISTRY_AAS = REMAINING_CHEMISTRY %>% filter(group == CHEMISTRY_GRP)%>% pull(letter)
 # MUTS_DT %>% select(Peptide,logsOdds, TCRContact_Peptide, AnchorRes)%>%
 #         mutate(Anchor_Count =  stringi::stri_count_regex(MUTS_DT$AnchorRes, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
 #         mutate(Anchor_Fraction = Anchor_Count/nchar(AnchorRes))%>% select(!Anchor_Count)%>% mutate(group=CHEMISTRY_GRP)%>%
 #         mutate(TCRContact_Count =  stringi::stri_count_regex(MUTS_DT$TCRContact_Peptide, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
 #         mutate(TCRContact_Fraction = TCRContact_Count/nchar(TCRContact_Peptide))%>% select(!TCRContact_Count)%>% mutate(group=CHEMISTRY_GRP)
#}

REMAINING_CHEMISTRY_MUTS_DT=foreach::foreach(i=1:length(unique(REMAINING_CHEMISTRY$group)), .combine = "rbind", .packages = c("dplyr","data.table"))%do% {
  CHEMISTRY_GRP = unique(REMAINING_CHEMISTRY$group)[i]
  CHEMISTRY_AAS = REMAINING_CHEMISTRY %>% filter(group == CHEMISTRY_GRP)%>% pull(letter)
  MUTS_DT %>% select(Peptide,logsOdds, TCRContact_Peptide, AnchorRes)%>%
          mutate(Anchor_Count =  stringi::stri_count_regex(MUTS_DT$AnchorRes, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
          mutate(Anchor_Fraction = Anchor_Count/nchar(AnchorRes))%>% select(!Anchor_Count)%>% mutate(group=CHEMISTRY_GRP)%>%
          mutate(TCRContact_Count =  stringi::stri_count_regex(MUTS_DT$TCRContact_Peptide, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
          mutate(TCRContact_Fraction = TCRContact_Count/nchar(TCRContact_Peptide))%>% select(!TCRContact_Count)%>% mutate(group=CHEMISTRY_GRP)
}

LOW_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% LOW_SET$Peptide)%>% mutate(Dataset = "Negative")#
HIGH_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% HIGH_SET$Peptide)%>% mutate(Dataset = "High")
TOLERANT_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% c("TLACFVLAAV","YYHKNNKSW","FGEVFNATRF","QTGKIADYNY","QWNLVIGFLF","GEVFNATRF","FQPTNGVGY",
                                                                    "SYQTQTNSPR","DSKVGGNYNY"))%>% mutate(Dataset = "Negligible")
#TOLERANT_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% c("GEVFNATRF","SYQTQTNSPR","FGEVFNATRF","CNDPFLGVYY","HVSGTNGTK"))%>% mutate(Dataset = "Negligible")
LOW_HIGH_DT= LOW_SET %>% rbind(HIGH_SET)%>% rbind(TOLERANT_SET)%>% mutate(Dataset = factor(Dataset, levels = c("High","Negligible","Negative")))
#ENHANCED_IMP_DT %>%
#        ggplot(aes(x=Dataset, y=Anchor_Fraction, fill=Dataset))+
#        geom_dotplot(binaxis="y", stackdir="center",stackratio=1,dotsize = 0.7,binpositions = "all")+
#        theme_pubr(base_size = 16) +
#        facet_grid(~group)+
#        stat_summary(fun=mean, geom="crossbar", width=0.7) +
#        stat_compare_means(comparisons = my_comparisons, label="p.signif")

HYDROPHOBICITY_ANCHORFREQ_PLT=LOW_HIGH_DT %>% filter(group == "Hydrophobic") %>%
        ggviolin(x="Dataset",y="Anchor_Fraction",color="Dataset", add=c("boxplot","jitter"),trim=TRUE,palette = "jco")+
        stat_compare_means(comparisons = my_comparisons, label="p.signif",tip.length = 0.05,label.y=1.09)+
        theme_pubr(base_size = 16)+ylab("Freq. of hydrophobic residues") +
        facet_grid(~group)
HYDROPHOBICITY_ANCHORFREQ_PLT

#HYDROPHOBICITY_ANCHORFREQ_PLT=LOW_HIGH_DT %>% filter(group == "Hydrophobic") %>%
 #       ggboxplot(x="Dataset",y="Anchor_Fraction",color="Dataset", add=c("jitter"),palette = "jco")+
  #      stat_compare_means(comparisons = my_comparisons, label="p.signif",tip.length = 0.05,label.y=1.05)+
   #     theme_pubr(base_size = 16)+ylab("Freq. of hydrophobic residues") +
    #    facet_grid(~group)
#HYDROPHOBICITY_ANCHORFREQ_PLT

#HYDROPHOBICITY_ANCHORFREQ_PLT=LOW_HIGH_DT %>% filter(group == "Hydrophobic") %>%
 #       ggplot(aes(x=Dataset, y=Anchor_Fraction, color=Dataset))+geom_violin()+geom_jitter()+
  #      stat_compare_means(comparisons = my_comparisons, label="p.signif",tip.length = 0.05,label.y=1.09)+
   #     theme_pubr(base_size = 16)+ylab("Freq. of hydrophobic residues") +
    #    facet_grid(~group)
#HYDROPHOBICITY_ANCHORFREQ_PLT


saveRDS(HYDROPHOBICITY_ANCHORFREQ_PLT,file="HYDROPHOBICITY_ANCHORFREQ_PLT.rds")

#ENHANCED_IMP_DT %>%
 #       ggviolin(x="Dataset",y="TCRContact_Fraction",color="Dataset", add=c("boxplot","jitter"),palette = "jco")+
  #      stat_compare_means(comparisons = my_comparisons, label="p.signif",label.y = 1.6,label.sep = 0.2)+
   #     theme_pubr(base_size = 16) +
    #    facet_grid(~group)

14.5 Compare mutagenesis analysis of groups

  • What separates Tolerant vs impaired? hlas bound? number of hlas?
HLA_TRANSITIONPOT_GRPS_PLT=FULL_PREDICTIONS_DT %>% inner_join(LOW_HIGH_DT %>% select(Peptide, Dataset) %>% distinct())%>%
        group_by(Peptide, Dataset)%>%filter(Binder == "BINDER")%>%
        dplyr::summarise(n=n_distinct(Predicted_Binding))%>%
        ggboxplot(x="Dataset",y="n",add="jitter",color="Dataset",palette = "jco")+theme_pubr(base_size = 16)+stat_compare_means(comparisons = my_comparisons,label="p.signif")+ylab("# HLAs bound")
## Joining, by = "Peptide"
## `summarise()` has grouped output by 'Peptide'. You can override using the `.groups` argument.
HLA_TRANSITIONPOT_GRPS_PLT

saveRDS(HLA_TRANSITIONPOT_GRPS_PLT,file="HLA_TRANSITIONPOT_GRPS_PLT.rds")

14.6 What residues and positions of e.g.. hydrophobic residues are enriched in grups?

NINE_LOW_HIGH_DT = LOW_HIGH_DT%>% select(Peptide, Dataset)%>% distinct() %>% filter(nchar(Peptide)==9)
NINE_LOW_HIGH_DT=NINE_LOW_HIGH_DT%>% cbind(stringr::str_split_fixed(NINE_LOW_HIGH_DT$Peptide,n=9,pattern = ""))
TEN_LOW_HIGH_DT = LOW_HIGH_DT%>% select(Peptide, Dataset)%>% distinct() %>% filter(nchar(Peptide)==10)
TEN_LOW_HIGH_DT=TEN_LOW_HIGH_DT%>% cbind(stringr::str_split_fixed(TEN_LOW_HIGH_DT$Peptide,n=10,pattern = ""))



#NINE_ENHANCED_IMP_DT%>% mutate("10"="-")%>% rbind(TEN_ENHANCED_IMP_DT)%>%
 #       pivot_longer(cols = `1`:`10`)%>%dplyr::rename(letter=value,position=name)%>%
  #      inner_join(chemistry)%>%filter(position %in% c(1,2,9,10))%>%
   #
#ggplot(aes(x=Peptide, y=letter, color=group))+geom_point()+facet_wrap(~position+Dataset,scales="free_x",nrow=4)+rotate_x_text()
NINEMER_AA_BACKGROUND=REMAINING_CHEMISTRY_MUTS_DT %>% select(Peptide)%>% distinct() %>% filter(nchar(Peptide)==9)
NINEMER_AA_BACKGROUND=NINEMER_AA_BACKGROUND%>% cbind(stringr::str_split_fixed(NINEMER_AA_BACKGROUND$Peptide,n=9,pattern = ""))

TENMER_AA_BACKGROUND=REMAINING_CHEMISTRY_MUTS_DT %>% select(Peptide)%>% distinct() %>% filter(nchar(Peptide)==10)
TENMER_AA_BACKGROUND=TENMER_AA_BACKGROUND%>% cbind(stringr::str_split_fixed(TENMER_AA_BACKGROUND$Peptide,n=10,pattern = ""))

BACKGROUND_AA_DISTR=NINEMER_AA_BACKGROUND %>% mutate("10"="-") %>% rbind(TENMER_AA_BACKGROUND)
BACKGROUND_AA_DISTR=BACKGROUND_AA_DISTR%>%
        pivot_longer(cols = `1`:`10`)%>%dplyr::rename(letter=value,position=name)%>% ungroup()

SAMPLED_BACKGROUND_AA_DISTR=foreach::foreach(i=1:10, .combine = "rbind", .packages = c("dplyr","data.table"))%do% {
    SAMPLED_PEPS=BACKGROUND_AA_DISTR %>% select(Peptide) %>% distinct()%>% sample_n(size=5)%>% pull(Peptide)
    BACKGROUND_AA_DISTR %>% filter(Peptide %in% SAMPLED_PEPS)%>% group_by(letter)%>% dplyr::summarise(n=n())%>% mutate(ID=i)
}

SAMPLED_BACKGROUND_AA_DISTR=SAMPLED_BACKGROUND_AA_DISTR %>% mutate(Dataset = "CTRL")%>% select(!ID)%>% inner_join(chemistry %>% select(! col))
## Joining, by = "letter"
AA_COMP_TRANSITIONPOT_GRPS=NINE_LOW_HIGH_DT%>% mutate("10"="-")%>% rbind(TEN_LOW_HIGH_DT)%>%
        pivot_longer(cols = `1`:`10`)%>%dplyr::rename(letter=value,position=name)%>%
        inner_join(chemistry)%>% group_by(Dataset, letter, group) %>% dplyr::summarise(n=n())%>%
        rbind(SAMPLED_BACKGROUND_AA_DISTR)%>%
        group_by(Dataset, letter, group)%>% dplyr::summarise(meanN=mean(n), sd=sd(n))%>%
        filter(group == "Hydrophobic")%>%
        ggplot(aes(x=letter, y=meanN, fill=Dataset))+geom_bar(stat="identity",position=position_dodge())+
        geom_errorbar(aes(ymin=meanN-sd, ymax=meanN+sd), width=.2, position=position_dodge(.9))+
        facet_wrap(~group,scales="free",nrow=1)+theme_pubr(base_size = 16)+ylab("Count")+xlab("Amino Acid")
## Joining, by = "letter"
## `summarise()` has grouped output by 'Dataset', 'letter'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'Dataset', 'letter'. You can override using the `.groups` argument.
AA_COMP_TRANSITIONPOT_GRPS

saveRDS(AA_COMP_TRANSITIONPOT_GRPS,file="AA_COMP_TRANSITIONPOT_GRPS.rds")

15 contact positions

15.1 9mers

HYDRO_AA=chemistry %>% filter(group=="Hydrophobic")%>% pull(letter)

LENGTH=9
POSITIONS = c(3,4,5,6,7,8)

MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

15.2 10mers

LENGTH=10
POSITIONS = c(3,4,5,6,7,8,9)

MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

16 anchor

16.1 9mers

HYDRO_AA=chemistry %>% filter(group=="Hydrophobic")%>% pull(letter)

LENGTH=9
POSITIONS = c(1,2,9)

MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

16.2 10mers

LENGTH=10
POSITIONS = c(1,2,10)

MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
        mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
        dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
        slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'

FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
        ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
   cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
        add="loess")
## `geom_smooth()` using formula 'y ~ x'